R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Problem 4 - Build a machine learning model for medical diagnosis

Loading the necessary libraries

library(rpart)  # recursive partitioning for decision trees
library(caret) # To partition the data into test and training sets
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rpart.plot)
library(data.tree)  
library(caTools) #Manipulation of data
library(ggplot2)
library(tidyr)
library(outliers)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 0x0006): Library not loaded: '/opt/X11/lib/libSM.6.dylib'
##   Referenced from: '/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/modules/R_X11.so'
##   Reason: tried: '/opt/X11/lib/libSM.6.dylib' (no such file), '/Library/Frameworks/R.framework/Resources/lib/libSM.6.dylib' (no such file), '/Library/Java/JavaVirtualMachines/jdk-17.0.1+12/Contents/Home/lib/server/libSM.6.dylib' (no such file)
## tcltk DLL is linked to '/opt/X11/lib/libX11.6.dylib'
## Could not load tcltk.  Will use slower R code instead.
## Loading required package: RSQLite
library(dlookr)
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:base':
## 
##     transform
library(corrplot)
## corrplot 0.92 loaded
library(aqp)
## This is aqp 1.42
## 
## Attaching package: 'aqp'
## The following objects are masked from 'package:dplyr':
## 
##     combine, slice
library(soilDB)
library('pROC') 
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR)
library("randomForest")
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:aqp':
## 
##     combine
## The following object is masked from 'package:outliers':
## 
##     outlier
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(RColorBrewer)

Importing all the datasets

# Load the feature (X) and label (Y) tables for the train and test splits.
# The files are read with header = FALSE, so read.csv assigns the default
# column names (V1, V2, ...).
df_trainX <- read.csv(file = "trainX.csv", header = FALSE)
df_trainY <- read.csv(file = "trainY.csv", header = FALSE)
df_testX <- read.csv(file = "testX.csv", header = FALSE)
df_testY <- read.csv(file = "testY.csv", header = FALSE)

# Preview the first rows of each table.
head(df_testX)
head(df_testY)
head(df_trainX)
head(df_trainY)

Part a

Renaming all the columns

# Names for the 30 feature columns, defined once and shared by the train and
# test feature tables so the two can never drift apart.
# NOTE(review): "texure_se" looks like a typo for "texture_se"; it is kept
# unchanged because code outside this view may reference the column by name.
feature_names <- c(
  "radius_mean", "texture_mean", "perimeter_mean", "area_mean",
  "smoothness_mean", "compactness_mean", "concavity_mean",
  "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
  "radius_se", "texure_se", "perimeter_se", "area_se", "smoothness_se",
  "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
  "fractal_dimension_se", "radius_worst", "texture_worst",
  "perimeter_worst", "area_worst", "smoothness_worst", "compactness_worst",
  "concavity_worst", "concave_points_worst", "symmetry_worst",
  "fractal_dimension_worst"
)

# Fail early if either table does not have exactly one column per name,
# instead of silently mislabelling the columns.
stopifnot(
  ncol(df_testX) == length(feature_names),
  ncol(df_trainX) == length(feature_names)
)

names(df_testX) <- feature_names
names(df_trainX) <- feature_names


# Use `cbind()` to attach the labels (df_testY / df_trainY) to the features
# (df_testX / df_trainX). The label column arrives as V1 and is renamed to
# `diagnosis`.
df_test <- cbind(df_testX, df_testY) %>% rename(diagnosis = V1)
df_train <- cbind(df_trainX, df_trainY) %>% rename(diagnosis = V1)

# Preview the combined tables; only the first rows are shown rather than
# dumping every row of both data frames into the report.
head(df_test)
head(df_train)

Univariate analysis - Finding the summary statistics of each column

#structure of dataframe
str(df_test)
## 'data.frame':    57 obs. of  31 variables:
##  $ radius_mean            : num  13.4 12.2 14.8 14.6 14.6 ...
##  $ texture_mean           : num  30.7 20.5 17.7 15.2 23.3 ...
##  $ perimeter_mean         : num  86.3 77.2 95.9 95.8 94 ...
##  $ area_mean              : num  557 459 675 652 665 ...
##  $ smoothness_mean        : num  0.0925 0.0801 0.0918 0.1132 0.0868 ...
##  $ compactness_mean       : num  0.0743 0.0404 0.0889 0.1339 0.0664 ...
##  $ concavity_mean         : num  0.0282 0.0238 0.0407 0.0997 0.0839 ...
##  $ concave_points_mean    : num  0.0326 0.0177 0.0226 0.0706 0.0527 ...
##  $ symmetry_mean          : num  0.138 0.174 0.189 0.212 0.163 ...
##  $ fractal_dimension_mean : num  0.0602 0.0568 0.0589 0.0635 0.0542 ...
##  $ radius_se              : num  0.341 0.192 0.22 0.511 0.416 ...
##  $ texure_se              : num  1.924 1.571 0.622 0.737 1.627 ...
##  $ perimeter_se           : num  2.29 1.18 1.48 3.81 2.91 ...
##  $ area_se                : num  28.9 14.7 19.8 42.8 33 ...
##  $ smoothness_se          : num  0.00584 0.00508 0.0048 0.00551 0.00831 ...
##  $ compactness_se         : num  0.0125 0.0061 0.0117 0.0441 0.0174 ...
##  $ concavity_se           : num  0.00794 0.01069 0.01758 0.04436 0.03389 ...
##  $ concave_points_se      : num  0.00913 0.0068 0.0069 0.01623 0.01576 ...
##  $ symmetry_se            : num  0.0156 0.0145 0.0225 0.0243 0.0174 ...
##  $ fractal_dimension_se   : num  0.00298 0.00153 0.00197 0.00484 0.00287 ...
##  $ radius_worst           : num  15.1 13.3 16.4 16.3 15.8 ...
##  $ texture_worst          : num  41.6 32.8 22.7 18.2 31.7 ...
##  $ perimeter_worst        : num  96.7 84.6 105.9 109.4 102.2 ...
##  $ area_worst             : num  706 548 830 804 758 ...
##  $ smoothness_worst       : num  0.117 0.112 0.123 0.128 0.131 ...
##  $ compactness_worst      : num  0.1421 0.0886 0.1881 0.3089 0.1581 ...
##  $ concavity_worst        : num  0.07 0.115 0.206 0.26 0.268 ...
##  $ concave_points_worst   : num  0.0776 0.0743 0.0831 0.1397 0.1359 ...
##  $ symmetry_worst         : num  0.22 0.269 0.36 0.315 0.248 ...
##  $ fractal_dimension_worst: num  0.0767 0.0688 0.0728 0.0847 0.0684 ...
##  $ diagnosis              : int  0 0 0 0 1 1 0 1 0 1 ...
str(df_train)
## 'data.frame':    455 obs. of  31 variables:
##  $ radius_mean            : num  12.8 12.9 17.9 19.2 13.7 ...
##  $ texture_mean           : num  22.3 13.3 24.5 26.6 15.2 ...
##  $ perimeter_mean         : num  85.3 82.8 115.2 126.2 88.3 ...
##  $ area_mean              : num  503 505 999 1138 581 ...
##  $ smoothness_mean        : num  0.1088 0.1134 0.0886 0.102 0.0827 ...
##  $ compactness_mean       : num  0.1799 0.0883 0.0703 0.1453 0.0755 ...
##  $ concavity_mean         : num  0.1695 0.038 0.057 0.1921 0.0425 ...
##  $ concave_points_mean    : num  0.0686 0.034 0.0474 0.0966 0.0247 ...
##  $ symmetry_mean          : num  0.212 0.154 0.154 0.19 0.179 ...
##  $ fractal_dimension_mean : num  0.0725 0.0648 0.0551 0.0622 0.059 ...
##  $ radius_se              : num  0.306 0.221 0.421 0.636 0.14 ...
##  $ texure_se              : num  1.069 1.042 1.433 1.001 0.542 ...
##  $ perimeter_se           : num  2.26 1.61 2.77 4.32 1.1 ...
##  $ area_se                : num  25.1 16.6 45.8 69.7 11.3 ...
##  $ smoothness_se          : num  0.00698 0.00591 0.00544 0.00739 0.00521 ...
##  $ compactness_se         : num  0.0386 0.0202 0.0117 0.0245 0.0298 ...
##  $ concavity_se           : num  0.0468 0.019 0.0162 0.0399 0.0244 ...
##  $ concave_points_se      : num  0.01499 0.01011 0.00852 0.01293 0.00836 ...
##  $ symmetry_se            : num  0.0168 0.012 0.0142 0.0143 0.0182 ...
##  $ fractal_dimension_se   : num  0.00562 0.00311 0.00275 0.00345 0.00487 ...
##  $ radius_worst           : num  15.2 14 20.9 23.7 14.5 ...
##  $ texture_worst          : num  30.1 21.1 34.7 35.9 19.6 ...
##  $ perimeter_worst        : num  105.3 92.8 135.1 159.8 98 ...
##  $ area_worst             : num  706 600 1320 1724 657 ...
##  $ smoothness_worst       : num  0.178 0.155 0.132 0.178 0.128 ...
##  $ compactness_worst      : num  0.534 0.223 0.181 0.384 0.31 ...
##  $ concavity_worst        : num  0.628 0.179 0.208 0.575 0.257 ...
##  $ concave_points_worst   : num  0.198 0.116 0.114 0.187 0.105 ...
##  $ symmetry_worst         : num  0.341 0.238 0.25 0.326 0.339 ...
##  $ fractal_dimension_worst: num  0.1243 0.0855 0.0795 0.0972 0.0964 ...
##  $ diagnosis              : int  1 0 1 1 0 1 0 0 1 0 ...
#Findings : There are 31 fields with 57 rows in test dataset
          # There are 31 fields with 455 rows in train dataset

#summary
summary(df_test)
##   radius_mean      texture_mean   perimeter_mean     area_mean     
##  Min.   : 8.597   Min.   :10.38   Min.   : 54.09   Min.   : 221.2  
##  1st Qu.:11.890   1st Qu.:15.24   1st Qu.: 77.22   1st Qu.: 440.6  
##  Median :13.530   Median :18.61   Median : 87.91   Median : 565.4  
##  Mean   :14.357   Mean   :18.65   Mean   : 93.53   Mean   : 674.0  
##  3rd Qu.:17.140   3rd Qu.:20.58   3rd Qu.:115.00   3rd Qu.: 912.7  
##  Max.   :22.270   Max.   :31.12   Max.   :152.80   Max.   :1509.0  
##  smoothness_mean   compactness_mean  concavity_mean    concave_points_mean
##  Min.   :0.07026   Min.   :0.03212   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08713   1st Qu.:0.06136   1st Qu.:0.02819   1st Qu.:0.02260    
##  Median :0.09831   Median :0.09752   Median :0.06636   Median :0.03438    
##  Mean   :0.09808   Mean   :0.10531   Mean   :0.09124   Mean   :0.05327    
##  3rd Qu.:0.10740   3rd Qu.:0.13100   3rd Qu.:0.13350   3rd Qu.:0.08293    
##  Max.   :0.13260   Max.   :0.27760   Max.   :0.42640   Max.   :0.18230    
##  symmetry_mean    fractal_dimension_mean   radius_se        texure_se     
##  Min.   :0.1342   Min.   :0.05395        Min.   :0.1312   Min.   :0.3602  
##  1st Qu.:0.1619   1st Qu.:0.05886        1st Qu.:0.2522   1st Qu.:0.8570  
##  Median :0.1792   Median :0.06140        Median :0.3478   Median :1.0380  
##  Mean   :0.1859   Mean   :0.06248        Mean   :0.4503   Mean   :1.1721  
##  3rd Qu.:0.1973   3rd Qu.:0.06491        3rd Qu.:0.5449   3rd Qu.:1.4750  
##  Max.   :0.3040   Max.   :0.07871        Max.   :1.2150   Max.   :2.7770  
##   perimeter_se       area_se        smoothness_se      compactness_se    
##  Min.   : 1.107   Min.   :  9.438   Min.   :0.003271   Min.   :0.004711  
##  1st Qu.: 1.696   1st Qu.: 18.950   1st Qu.:0.004796   1st Qu.:0.012700  
##  Median : 2.567   Median : 31.010   Median :0.005841   Median :0.018850  
##  Mean   : 3.152   Mean   : 46.000   Mean   :0.006489   Mean   :0.022830  
##  3rd Qu.: 3.814   3rd Qu.: 50.960   3rd Qu.:0.006662   3rd Qu.:0.030290  
##  Max.   :10.050   Max.   :199.700   Max.   :0.020750   Max.   :0.086680  
##   concavity_se     concave_points_se  symmetry_se      fractal_dimension_se
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.01057   Min.   :0.0009502   
##  1st Qu.:0.01390   1st Qu.:0.00842   1st Qu.:0.01447   1st Qu.:0.0022680   
##  Median :0.02636   Median :0.01069   Median :0.01731   Median :0.0029850   
##  Mean   :0.02761   Mean   :0.01130   Mean   :0.02005   Mean   :0.0034004   
##  3rd Qu.:0.03437   3rd Qu.:0.01365   3rd Qu.:0.02370   3rd Qu.:0.0042250   
##  Max.   :0.10400   Max.   :0.02480   Max.   :0.06146   Max.   :0.0074440   
##   radius_worst    texture_worst   perimeter_worst    area_worst    
##  Min.   : 8.952   Min.   :12.49   Min.   : 56.65   Min.   : 240.1  
##  1st Qu.:13.340   1st Qu.:20.14   1st Qu.: 85.10   1st Qu.: 547.8  
##  Median :15.750   Median :24.62   Median :102.50   Median : 758.2  
##  Mean   :16.967   Mean   :25.28   Mean   :112.20   Mean   : 964.9  
##  3rd Qu.:20.010   3rd Qu.:28.39   3rd Qu.:134.90   3rd Qu.:1227.0  
##  Max.   :32.490   Max.   :47.16   Max.   :214.00   Max.   :3432.0  
##  smoothness_worst  compactness_worst concavity_worst  concave_points_worst
##  Min.   :0.08484   Min.   :0.05332   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11370   1st Qu.:0.14210   1st Qu.:0.1091   1st Qu.:0.07407     
##  Median :0.13470   Median :0.23020   Median :0.2604   Median :0.11380     
##  Mean   :0.13204   Mean   :0.25247   Mean   :0.2771   Mean   :0.12033     
##  3rd Qu.:0.14780   3rd Qu.:0.33090   3rd Qu.:0.3779   3rd Qu.:0.16420     
##  Max.   :0.18510   Max.   :0.69970   Max.   :0.9608   Max.   :0.29100     
##  symmetry_worst   fractal_dimension_worst   diagnosis     
##  Min.   :0.1890   Min.   :0.06037         Min.   :0.0000  
##  1st Qu.:0.2542   1st Qu.:0.07191         1st Qu.:0.0000  
##  Median :0.2889   Median :0.07875         Median :0.0000  
##  Mean   :0.2950   Mean   :0.08268         Mean   :0.4386  
##  3rd Qu.:0.3216   3rd Qu.:0.08579         3rd Qu.:1.0000  
##  Max.   :0.4761   Max.   :0.14020         Max.   :1.0000
summary(df_train)
##   radius_mean      texture_mean   perimeter_mean     area_mean     
##  Min.   : 6.981   Min.   : 9.71   Min.   : 43.79   Min.   : 143.5  
##  1st Qu.:11.615   1st Qu.:16.21   1st Qu.: 74.70   1st Qu.: 412.6  
##  Median :13.280   Median :18.83   Median : 85.98   Median : 545.2  
##  Mean   :14.105   Mean   :19.39   Mean   : 91.86   Mean   : 654.6  
##  3rd Qu.:15.815   3rd Qu.:21.93   3rd Qu.:103.75   3rd Qu.: 785.6  
##  Max.   :28.110   Max.   :39.28   Max.   :188.50   Max.   :2501.0  
##  smoothness_mean   compactness_mean  concavity_mean    concave_points_mean
##  Min.   :0.05263   Min.   :0.01938   Min.   :0.00000   Min.   :0.00000    
##  1st Qu.:0.08618   1st Qu.:0.06616   1st Qu.:0.03041   1st Qu.:0.01988    
##  Median :0.09578   Median :0.09453   Median :0.06476   Median :0.03390    
##  Mean   :0.09623   Mean   :0.10527   Mean   :0.09028   Mean   :0.04900    
##  3rd Qu.:0.10470   3rd Qu.:0.13060   3rd Qu.:0.13220   3rd Qu.:0.07402    
##  Max.   :0.16340   Max.   :0.34540   Max.   :0.42680   Max.   :0.20120    
##  symmetry_mean    fractal_dimension_mean   radius_se        texure_se     
##  Min.   :0.1060   Min.   :0.04996        Min.   :0.1115   Min.   :0.3621  
##  1st Qu.:0.1626   1st Qu.:0.05799        1st Qu.:0.2321   1st Qu.:0.8281  
##  Median :0.1799   Median :0.06183        Median :0.3163   Median :1.1080  
##  Mean   :0.1809   Mean   :0.06301        Mean   :0.4020   Mean   :1.2274  
##  3rd Qu.:0.1958   3rd Qu.:0.06639        3rd Qu.:0.4695   3rd Qu.:1.4795  
##  Max.   :0.2906   Max.   :0.09744        Max.   :2.8730   Max.   :4.8850  
##   perimeter_se       area_se        smoothness_se      compactness_se    
##  Min.   : 0.757   Min.   :  6.802   Min.   :0.001713   Min.   :0.002252  
##  1st Qu.: 1.645   1st Qu.: 17.670   1st Qu.:0.005228   1st Qu.:0.013710  
##  Median : 2.279   Median : 23.940   Median :0.006458   Median :0.021150  
##  Mean   : 2.856   Mean   : 40.172   Mean   :0.007162   Mean   :0.026047  
##  3rd Qu.: 3.307   3rd Qu.: 44.935   3rd Qu.:0.008370   3rd Qu.:0.033065  
##  Max.   :21.980   Max.   :542.200   Max.   :0.031130   Max.   :0.135400  
##   concavity_se     concave_points_se   symmetry_se       fractal_dimension_se
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.007882   Min.   :0.0008948   
##  1st Qu.:0.01574   1st Qu.:0.007759   1st Qu.:0.015220   1st Qu.:0.0022795   
##  Median :0.02626   Median :0.011030   Median :0.018970   Median :0.0032370   
##  Mean   :0.03288   Mean   :0.011887   Mean   :0.020687   Mean   :0.0038707   
##  3rd Qu.:0.04290   3rd Qu.:0.014960   3rd Qu.:0.023705   3rd Qu.:0.0045715   
##  Max.   :0.39600   Max.   :0.052790   Max.   :0.078950   Max.   :0.0298400   
##   radius_worst   texture_worst   perimeter_worst    area_worst    
##  Min.   : 7.93   Min.   :12.02   Min.   : 50.41   Min.   : 185.2  
##  1st Qu.:12.97   1st Qu.:21.09   1st Qu.: 83.80   1st Qu.: 510.1  
##  Median :14.90   Median :25.44   Median : 97.58   Median : 677.3  
##  Mean   :16.22   Mean   :25.78   Mean   :106.95   Mean   : 877.5  
##  3rd Qu.:18.66   3rd Qu.:29.99   3rd Qu.:125.65   3rd Qu.:1057.0  
##  Max.   :36.04   Max.   :49.54   Max.   :251.20   Max.   :4254.0  
##  smoothness_worst  compactness_worst concavity_worst  concave_points_worst
##  Min.   :0.07117   Min.   :0.02729   Min.   :0.0000   Min.   :0.00000     
##  1st Qu.:0.11735   1st Qu.:0.14860   1st Qu.:0.1203   1st Qu.:0.06326     
##  Median :0.13120   Median :0.21700   Median :0.2299   Median :0.10170     
##  Mean   :0.13278   Mean   :0.25871   Mean   :0.2769   Mean   :0.11502     
##  3rd Qu.:0.14635   3rd Qu.:0.34305   3rd Qu.:0.3900   3rd Qu.:0.16650     
##  Max.   :0.22260   Max.   :1.05800   Max.   :1.2520   Max.   :0.29030     
##  symmetry_worst   fractal_dimension_worst   diagnosis     
##  Min.   :0.1565   Min.   :0.05504         Min.   :0.0000  
##  1st Qu.:0.2510   1st Qu.:0.07224         1st Qu.:0.0000  
##  Median :0.2826   Median :0.08052         Median :0.0000  
##  Mean   :0.2905   Mean   :0.08464         Mean   :0.3714  
##  3rd Qu.:0.3181   3rd Qu.:0.09219         3rd Qu.:1.0000  
##  Max.   :0.6638   Max.   :0.20750         Max.   :1.0000

Data cleaning

# Finding the missing values in the dataset using colSums

colSums(is.na(df_train))
##             radius_mean            texture_mean          perimeter_mean 
##                       0                       0                       0 
##               area_mean         smoothness_mean        compactness_mean 
##                       0                       0                       0 
##          concavity_mean     concave_points_mean           symmetry_mean 
##                       0                       0                       0 
##  fractal_dimension_mean               radius_se               texure_se 
##                       0                       0                       0 
##            perimeter_se                 area_se           smoothness_se 
##                       0                       0                       0 
##          compactness_se            concavity_se       concave_points_se 
##                       0                       0                       0 
##             symmetry_se    fractal_dimension_se            radius_worst 
##                       0                       0                       0 
##           texture_worst         perimeter_worst              area_worst 
##                       0                       0                       0 
##        smoothness_worst       compactness_worst         concavity_worst 
##                       0                       0                       0 
##    concave_points_worst          symmetry_worst fractal_dimension_worst 
##                       0                       0                       0 
##               diagnosis 
##                       0
#Findings : There are no missing records in the train dataset

# Finding outliers and treating them

# Box plot of every column to spot outliers.
# las must be one of 0-3 (see ?par); 3 draws all axis labels vertically.
boxplot(df_train, las = 3, main = "Outlier detection of all columns using box plot")

# The z-score rule is used instead of the inter-quartile range: the data has
# many outliers, and the IQR rule would remove around 236 records, which would
# shrink the training set dramatically.

# Absolute z-score of every value, computed column by column.
z_scores <- as.data.frame(
  vapply(
    df_train,
    function(column) abs(column - mean(column)) / sd(column),
    numeric(nrow(df_train))
  )
)

# Keep only rows in which no column lies more than 3 standard deviations from
# its mean. na.rm = TRUE stops a zero-variance column (NaN z-scores) from
# turning the row filter into NA.
keep_rows <- rowSums(z_scores > 3, na.rm = TRUE) == 0
Final_train_data <- df_train[keep_rows, ]
dim(Final_train_data)
## [1] 399  31
# Removed 56 rows (455 -> 399) containing at least one outlier, using the z-score method

boxplot(Final_train_data, las = 3, main = "Box plot after outlier treatment")

Bivariate analysis

# Histogram of perimeter_worst, one panel per diagnosis class

# diagnosis is stored as an integer; wrapping it in factor() makes the fill a
# discrete aesthetic so the bars are actually coloured by class.
ggplot(Final_train_data, aes(x = perimeter_worst, fill = factor(diagnosis))) +
  geom_histogram() +
  facet_wrap(~diagnosis) +
  labs(fill = "diagnosis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Correlation matrix

# Pairwise correlations between all columns of the cleaned training data
# (Pearson is the default method of cor(); spelled out here for clarity).

df_correlationMatrix <- cor(Final_train_data, method = "pearson")

# Show the full correlation matrix

print(df_correlationMatrix)
##                         radius_mean texture_mean perimeter_mean   area_mean
## radius_mean              1.00000000   0.29856413     0.99817835  0.99173530
## texture_mean             0.29856413   1.00000000     0.30626837  0.30701145
## perimeter_mean           0.99817835   0.30626837     1.00000000  0.99015205
## area_mean                0.99173530   0.30701145     0.99015205  1.00000000
## smoothness_mean          0.13376026   0.01609664     0.16644041  0.13640053
## compactness_mean         0.54769702   0.26927860     0.59331539  0.53738059
## concavity_mean           0.71814375   0.32849031     0.75273868  0.72163561
## concave_points_mean      0.82573094   0.29966970     0.85043581  0.82870480
## symmetry_mean            0.15554889   0.13111279     0.18248339  0.16404398
## fractal_dimension_mean  -0.37279201  -0.04865916    -0.33021776 -0.35327903
## radius_se                0.67725904   0.35237177     0.68549184  0.71377702
## texure_se               -0.13701178   0.42803985    -0.13149297 -0.11063807
## perimeter_se             0.66755582   0.36335223     0.68297762  0.70036076
## area_se                  0.80536420   0.34784770     0.81099324  0.84108241
## smoothness_se           -0.33266277   0.06621983    -0.31716814 -0.28714960
## compactness_se           0.25885063   0.27828229     0.29990389  0.25752191
## concavity_se             0.32694032   0.25838760     0.36100574  0.32829398
## concave_points_se        0.47213735   0.22149589     0.49645088  0.46547252
## symmetry_se             -0.20514422   0.05334888    -0.19389509 -0.17716663
## fractal_dimension_se     0.00128676   0.17422342     0.03803676  0.01729457
## radius_worst             0.97376574   0.32550845     0.97455488  0.97406425
## texture_worst            0.28330870   0.90202913     0.29103924  0.28856895
## perimeter_worst          0.96834318   0.33651958     0.97402953  0.96746873
## area_worst               0.95230774   0.32819465     0.95297343  0.96870871
## smoothness_worst         0.10743610   0.09365363     0.13651894  0.11563836
## compactness_worst        0.46727031   0.27447500     0.50668065  0.44898117
## concavity_worst          0.58706785   0.29574520     0.62067646  0.57623805
## concave_points_worst     0.76855143   0.27514011     0.79205244  0.75561850
## symmetry_worst           0.20385254   0.12281898     0.22299758  0.19866426
## fractal_dimension_worst  0.05627572   0.14152280     0.09667917  0.05555742
## diagnosis                0.74832674   0.41974758     0.76302074  0.75127424
##                         smoothness_mean compactness_mean concavity_mean
## radius_mean                  0.13376026       0.54769702    0.718143754
## texture_mean                 0.01609664       0.26927860    0.328490307
## perimeter_mean               0.16644041       0.59331539    0.752738684
## area_mean                    0.13640053       0.53738059    0.721635613
## smoothness_mean              1.00000000       0.61654103    0.479016759
## compactness_mean             0.61654103       1.00000000    0.887872826
## concavity_mean               0.47901676       0.88787283    1.000000000
## concave_points_mean          0.51812644       0.82729890    0.930057825
## symmetry_mean                0.50148833       0.52226317    0.439731159
## fractal_dimension_mean       0.56621360       0.44377231    0.186401719
## radius_se                    0.26259442       0.49384227    0.626276910
## texure_se                    0.10664792      -0.01797803    0.004139446
## perimeter_se                 0.26027918       0.56123077    0.670709175
## area_se                      0.22522831       0.52184522    0.685899256
## smoothness_se                0.33964515       0.03648579   -0.013929824
## compactness_se               0.24985970       0.73965861    0.651563854
## concavity_se                 0.22795699       0.67372541    0.730272740
## concave_points_se            0.38192674       0.64872500    0.677019705
## symmetry_se                  0.10512381       0.03089304   -0.012267449
## fractal_dimension_se         0.29305441       0.53867407    0.419301686
## radius_worst                 0.18101717       0.58130638    0.738278198
## texture_worst                0.07238727       0.27443635    0.328585638
## perimeter_worst              0.20136965       0.63378538    0.776144268
## area_worst                   0.17818463       0.55890080    0.727269163
## smoothness_worst             0.79921862       0.54674159    0.446708473
## compactness_worst            0.39395246       0.86935871    0.774610519
## concavity_worst              0.37140357       0.82984199    0.894080298
## concave_points_worst         0.45500368       0.81543006    0.872795667
## symmetry_worst               0.34213190       0.45516597    0.379008749
## fractal_dimension_worst      0.44875880       0.66738446    0.492555443
## diagnosis                    0.34236224       0.63724181    0.762824154
##                         concave_points_mean symmetry_mean
## radius_mean                      0.82573094     0.1555489
## texture_mean                     0.29966970     0.1311128
## perimeter_mean                   0.85043581     0.1824834
## area_mean                        0.82870480     0.1640440
## smoothness_mean                  0.51812644     0.5014883
## compactness_mean                 0.82729890     0.5222632
## concavity_mean                   0.93005782     0.4397312
## concave_points_mean              1.00000000     0.4212773
## symmetry_mean                    0.42127731     1.0000000
## fractal_dimension_mean           0.06054732     0.3495092
## radius_se                        0.70507350     0.3178219
## texure_se                       -0.01165410     0.1303349
## perimeter_se                     0.72606721     0.3256623
## area_se                          0.77510997     0.2743277
## smoothness_se                   -0.05177695     0.1831325
## compactness_se                   0.48727029     0.3432753
## concavity_se                     0.52849556     0.3013606
## concave_points_se                0.67831316     0.3232982
## symmetry_se                     -0.03456125     0.3076929
## fractal_dimension_se             0.27389979     0.2810543
## radius_worst                     0.83834671     0.2055676
## texture_worst                    0.30117295     0.1560837
## perimeter_worst                  0.85985741     0.2282504
## area_worst                       0.82570144     0.2052586
## smoothness_worst                 0.43401875     0.4004411
## compactness_worst                0.66240146     0.4079059
## concavity_worst                  0.75858220     0.3744382
## concave_points_worst             0.90900531     0.3785374
## symmetry_worst                   0.36001092     0.6697614
## fractal_dimension_worst          0.34934609     0.3675820
## diagnosis                        0.80525149     0.3475161
##                         fractal_dimension_mean   radius_se    texure_se
## radius_mean                        -0.37279201  0.67725904 -0.137011775
## texture_mean                       -0.04865916  0.35237177  0.428039853
## perimeter_mean                     -0.33021776  0.68549184 -0.131492968
## area_mean                          -0.35327903  0.71377702 -0.110638070
## smoothness_mean                     0.56621360  0.26259442  0.106647921
## compactness_mean                    0.44377231  0.49384227 -0.017978031
## concavity_mean                      0.18640172  0.62627691  0.004139446
## concave_points_mean                 0.06054732  0.70507350 -0.011654100
## symmetry_mean                       0.34950922  0.31782187  0.130334923
## fractal_dimension_mean              1.00000000 -0.10122740  0.127282395
## radius_se                          -0.10122740  1.00000000  0.240504653
## texure_se                           0.12728239  0.24050465  1.000000000
## perimeter_se                       -0.05044046  0.96404109  0.255426741
## area_se                            -0.18140107  0.96328091  0.124427757
## smoothness_se                       0.42599486  0.12530663  0.467731449
## compactness_se                      0.42657153  0.37069401  0.186754131
## concavity_se                        0.29251085  0.39367806  0.136010860
## concave_points_se                   0.15762242  0.57187840  0.223141738
## symmetry_se                         0.23283881  0.14974169  0.411281718
## fractal_dimension_se                0.64524127  0.24219863  0.271576115
## radius_worst                       -0.31200858  0.72481796 -0.139816340
## texture_worst                      -0.02616985  0.27763734  0.485285882
## perimeter_worst                    -0.27021279  0.71834787 -0.137716374
## area_worst                         -0.29233858  0.74809351 -0.114746735
## smoothness_worst                    0.50468702  0.13938535 -0.045475869
## compactness_worst                   0.34815821  0.30298133 -0.154398023
## concavity_worst                     0.20832354  0.39850021 -0.144569668
## concave_points_worst                0.06444937  0.53477396 -0.151641575
## symmetry_worst                      0.23301162  0.10630343 -0.158795232
## fractal_dimension_worst             0.71506685  0.05077695 -0.105501248
## diagnosis                          -0.06009576  0.63629440 -0.013588861
##                         perimeter_se     area_se smoothness_se compactness_se
## radius_mean               0.66755582  0.80536420  -0.332662766      0.2588506
## texture_mean              0.36335223  0.34784770   0.066219825      0.2782823
## perimeter_mean            0.68297762  0.81099324  -0.317168143      0.2999039
## area_mean                 0.70036076  0.84108241  -0.287149602      0.2575219
## smoothness_mean           0.26027918  0.22522831   0.339645152      0.2498597
## compactness_mean          0.56123077  0.52184522   0.036485793      0.7396586
## concavity_mean            0.67070918  0.68589926  -0.013929824      0.6515639
## concave_points_mean       0.72606721  0.77510997  -0.051776951      0.4872703
## symmetry_mean             0.32566232  0.27432766   0.183132548      0.3432753
## fractal_dimension_mean   -0.05044046 -0.18140107   0.425994860      0.4265715
## radius_se                 0.96404109  0.96328091   0.125306627      0.3706940
## texure_se                 0.25542674  0.12442776   0.467731449      0.1867541
## perimeter_se              1.00000000  0.93095668   0.123076681      0.4841160
## area_se                   0.93095668  1.00000000  -0.010214549      0.3485652
## smoothness_se             0.12307668 -0.01021455   1.000000000      0.1807416
## compactness_se            0.48411598  0.34856524   0.180741647      1.0000000
## concavity_se              0.48623021  0.39396851   0.129129693      0.8590665
## concave_points_se         0.64396494  0.55222101   0.258892205      0.6937252
## symmetry_se               0.18104921  0.04037709   0.475209628      0.2506622
## fractal_dimension_se      0.31271532  0.18288440   0.381157843      0.7927604
## radius_worst              0.70434288  0.84381849  -0.313820509      0.2648261
## texture_worst             0.28913016  0.29165535  -0.009665971      0.2227798
## perimeter_worst           0.72276195  0.83670895  -0.308243459      0.3241378
## area_worst                0.72310974  0.86982040  -0.270776390      0.2549233
## smoothness_worst          0.12803833  0.14008533   0.361912755      0.1992618
## compactness_worst         0.37989419  0.35887344  -0.132848737      0.7258262
## concavity_worst           0.45832552  0.47380259  -0.150700803      0.6522075
## concave_points_worst      0.57062456  0.62628689  -0.160635476      0.4979262
## symmetry_worst            0.11918570  0.13844031  -0.133034404      0.2146998
## fractal_dimension_worst   0.10134337  0.05955569   0.073112787      0.5817629
## diagnosis                 0.63145566  0.70031135  -0.096387398      0.3329372
##                         concavity_se concave_points_se symmetry_se
## radius_mean                0.3269403        0.47213735 -0.20514422
## texture_mean               0.2583876        0.22149589  0.05334888
## perimeter_mean             0.3610057        0.49645088 -0.19389509
## area_mean                  0.3282940        0.46547252 -0.17716663
## smoothness_mean            0.2279570        0.38192674  0.10512381
## compactness_mean           0.6737254        0.64872500  0.03089304
## concavity_mean             0.7302727        0.67701970 -0.01226745
## concave_points_mean        0.5284956        0.67831316 -0.03456125
## symmetry_mean              0.3013606        0.32329823  0.30769287
## fractal_dimension_mean     0.2925108        0.15762242  0.23283881
## radius_se                  0.3936781        0.57187840  0.14974169
## texure_se                  0.1360109        0.22314174  0.41128172
## perimeter_se               0.4862302        0.64396494  0.18104921
## area_se                    0.3939685        0.55222101  0.04037709
## smoothness_se              0.1291297        0.25889221  0.47520963
## compactness_se             0.8590665        0.69372523  0.25066225
## concavity_se               1.0000000        0.73789343  0.17402301
## concave_points_se          0.7378934        1.00000000  0.22558549
## symmetry_se                0.1740230        0.22558549  1.00000000
## fractal_dimension_se       0.6372759        0.50323067  0.31501285
## radius_worst               0.3233943        0.44433584 -0.22123516
## texture_worst              0.2091099        0.14910835 -0.05210785
## perimeter_worst            0.3730850        0.47603255 -0.21020433
## area_worst                 0.3160164        0.42758720 -0.19798475
## smoothness_worst           0.1904671        0.23363776 -0.09460989
## compactness_worst          0.6246958        0.47689613 -0.12766680
## concavity_worst            0.7400518        0.53897008 -0.16383553
## concave_points_worst       0.5389444        0.65965359 -0.18202915
## symmetry_worst             0.1698343        0.08453104  0.22092402
## fractal_dimension_worst    0.4468984        0.23880144 -0.06890816
## diagnosis                  0.3759069        0.47234209 -0.09437891
##                         fractal_dimension_se radius_worst texture_worst
## radius_mean                       0.00128676   0.97376574   0.283308705
## texture_mean                      0.17422342   0.32550845   0.902029129
## perimeter_mean                    0.03803676   0.97455488   0.291039235
## area_mean                         0.01729457   0.97406425   0.288568951
## smoothness_mean                   0.29305441   0.18101717   0.072387266
## compactness_mean                  0.53867407   0.58130638   0.274436348
## concavity_mean                    0.41930169   0.73827820   0.328585638
## concave_points_mean               0.27389979   0.83834671   0.301172947
## symmetry_mean                     0.28105430   0.20556765   0.156083686
## fractal_dimension_mean            0.64524127  -0.31200858  -0.026169854
## radius_se                         0.24219863   0.72481796   0.277637340
## texure_se                         0.27157611  -0.13981634   0.485285882
## perimeter_se                      0.31271532   0.70434288   0.289130158
## area_se                           0.18288440   0.84381849   0.291655354
## smoothness_se                     0.38115784  -0.31382051  -0.009665971
## compactness_se                    0.79276036   0.26482614   0.222779785
## concavity_se                      0.63727587   0.32339431   0.209109864
## concave_points_se                 0.50323067   0.44433584   0.149108345
## symmetry_se                       0.31501285  -0.22123516  -0.052107850
## fractal_dimension_se              1.00000000   0.01580814   0.108189333
## radius_worst                      0.01580814   1.00000000   0.343374904
## texture_worst                     0.10818933   0.34337490   1.000000000
## perimeter_worst                   0.05806924   0.99316403   0.354852917
## area_worst                        0.02730220   0.99021334   0.342605270
## smoothness_worst                  0.21411449   0.20579053   0.229647855
## compactness_worst                 0.46273053   0.52888749   0.339665011
## concavity_worst                   0.38118907   0.63557524   0.353464226
## concave_points_worst              0.23775943   0.80653679   0.341987017
## symmetry_worst                    0.08686780   0.29254533   0.260773895
## fractal_dimension_worst           0.66578174   0.14583905   0.227927621
## diagnosis                         0.13135370   0.79610486   0.456522368
##                         perimeter_worst area_worst smoothness_worst
## radius_mean                  0.96834318  0.9523077       0.10743610
## texture_mean                 0.33651958  0.3281947       0.09365363
## perimeter_mean               0.97402953  0.9529734       0.13651894
## area_mean                    0.96746873  0.9687087       0.11563836
## smoothness_mean              0.20136965  0.1781846       0.79921862
## compactness_mean             0.63378538  0.5589008       0.54674159
## concavity_mean               0.77614427  0.7272692       0.44670847
## concave_points_mean          0.85985741  0.8257014       0.43401875
## symmetry_mean                0.22825043  0.2052586       0.40044112
## fractal_dimension_mean      -0.27021279 -0.2923386       0.50468702
## radius_se                    0.71834787  0.7480935       0.13938535
## texure_se                   -0.13771637 -0.1147467      -0.04547587
## perimeter_se                 0.72276195  0.7231097       0.12803833
## area_se                      0.83670895  0.8698204       0.14008533
## smoothness_se               -0.30824346 -0.2707764       0.36191275
## compactness_se               0.32413780  0.2549233       0.19926185
## concavity_se                 0.37308505  0.3160164       0.19046709
## concave_points_se            0.47603255  0.4275872       0.23363776
## symmetry_se                 -0.21020433 -0.1979848      -0.09460989
## fractal_dimension_se         0.05806924  0.0273022       0.21411449
## radius_worst                 0.99316403  0.9902133       0.20579053
## texture_worst                0.35485292  0.3426053       0.22964785
## perimeter_worst              1.00000000  0.9816986       0.22521379
## area_worst                   0.98169860  1.0000000       0.20761800
## smoothness_worst             0.22521379  0.2076180       1.00000000
## compactness_worst            0.58626168  0.5016880       0.50787738
## concavity_worst              0.68208152  0.6138833       0.48824322
## concave_points_worst         0.83480860  0.7801895       0.51770861
## symmetry_worst               0.31385489  0.2776926       0.47626752
## fractal_dimension_worst      0.19373637  0.1409382       0.59349636
## diagnosis                    0.80641786  0.7838651       0.40584392
##                         compactness_worst concavity_worst concave_points_worst
## radius_mean                     0.4672703       0.5870679           0.76855143
## texture_mean                    0.2744750       0.2957452           0.27514011
## perimeter_mean                  0.5066806       0.6206765           0.79205244
## area_mean                       0.4489812       0.5762380           0.75561850
## smoothness_mean                 0.3939525       0.3714036           0.45500368
## compactness_mean                0.8693587       0.8298420           0.81543006
## concavity_mean                  0.7746105       0.8940803           0.87279567
## concave_points_mean             0.6624015       0.7585822           0.90900531
## symmetry_mean                   0.4079059       0.3744382           0.37853742
## fractal_dimension_mean          0.3481582       0.2083235           0.06444937
## radius_se                       0.3029813       0.3985002           0.53477396
## texure_se                      -0.1543980      -0.1445697          -0.15164157
## perimeter_se                    0.3798942       0.4583255           0.57062456
## area_se                         0.3588734       0.4738026           0.62628689
## smoothness_se                  -0.1328487      -0.1507008          -0.16063548
## compactness_se                  0.7258262       0.6522075           0.49792619
## concavity_se                    0.6246958       0.7400518           0.53894441
## concave_points_se               0.4768961       0.5389701           0.65965359
## symmetry_se                    -0.1276668      -0.1638355          -0.18202915
## fractal_dimension_se            0.4627305       0.3811891           0.23775943
## radius_worst                    0.5288875       0.6355752           0.80653679
## texture_worst                   0.3396650       0.3534642           0.34198702
## perimeter_worst                 0.5862617       0.6820815           0.83480860
## area_worst                      0.5016880       0.6138833           0.78018948
## smoothness_worst                0.5078774       0.4882432           0.51770861
## compactness_worst               1.0000000       0.9034638           0.79541707
## concavity_worst                 0.9034638       1.0000000           0.85826168
## concave_points_worst            0.7954171       0.8582617           1.00000000
## symmetry_worst                  0.5630911       0.4896662           0.47616247
## fractal_dimension_worst         0.7891382       0.6529336           0.49065397
## diagnosis                       0.5996528       0.6848074           0.79417733
##                         symmetry_worst fractal_dimension_worst   diagnosis
## radius_mean                 0.20385254              0.05627572  0.74832674
## texture_mean                0.12281898              0.14152280  0.41974758
## perimeter_mean              0.22299758              0.09667917  0.76302074
## area_mean                   0.19866426              0.05555742  0.75127424
## smoothness_mean             0.34213190              0.44875880  0.34236224
## compactness_mean            0.45516597              0.66738446  0.63724181
## concavity_mean              0.37900875              0.49255544  0.76282415
## concave_points_mean         0.36001092              0.34934609  0.80525149
## symmetry_mean               0.66976137              0.36758201  0.34751614
## fractal_dimension_mean      0.23301162              0.71506685 -0.06009576
## radius_se                   0.10630343              0.05077695  0.63629440
## texure_se                  -0.15879523             -0.10550125 -0.01358886
## perimeter_se                0.11918570              0.10134337  0.63145566
## area_se                     0.13844031              0.05955569  0.70031135
## smoothness_se              -0.13303440              0.07311279 -0.09638740
## compactness_se              0.21469980              0.58176286  0.33293716
## concavity_se                0.16983426              0.44689842  0.37590686
## concave_points_se           0.08453104              0.23880144  0.47234209
## symmetry_se                 0.22092402             -0.06890816 -0.09437891
## fractal_dimension_se        0.08686780              0.66578174  0.13135370
## radius_worst                0.29254533              0.14583905  0.79610486
## texture_worst               0.26077389              0.22792762  0.45652237
## perimeter_worst             0.31385489              0.19373637  0.80641786
## area_worst                  0.27769261              0.14093816  0.78386515
## smoothness_worst            0.47626752              0.59349636  0.40584392
## compactness_worst           0.56309108              0.78913824  0.59965280
## concavity_worst             0.48966619              0.65293362  0.68480742
## concave_points_worst        0.47616247              0.49065397  0.79417733
## symmetry_worst              1.00000000              0.50360062  0.41106337
## fractal_dimension_worst     0.50360062              1.00000000  0.32916951
## diagnosis                   0.41106337              0.32916951  1.00000000
# Plot the upper triangle of the correlation matrix, with variables
# ordered by hierarchical clustering and small text labels.
corrplot(
  df_correlationMatrix,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu"),
  tl.cex = 0.5
)

# From the output we can see that the columns perimeter_worst, concave_points_mean, concave_points_worst, texture_mean, area_mean, radius_worst and area_worst are highly correlated with the diagnosis column

# Find attributes that are highly correlated with each other, using a
# pairwise correlation cutoff of 0.8; verbose = TRUE logs each comparison.
# NOTE(review): df_correlationMatrix also contains the diagnosis column --
# confirm that including the target here is intended.
df_highlyCorrelated <- findCorrelation(
  df_correlationMatrix,
  cutoff = 0.8,
  verbose = TRUE
)
## Compare row 7  and column  28 with corr  0.873 
##   Means:  0.571 vs 0.407 so flagging column 7 
## Compare row 28  and column  8 with corr  0.909 
##   Means:  0.554 vs 0.396 so flagging column 28 
## Compare row 8  and column  6 with corr  0.827 
##   Means:  0.539 vs 0.386 so flagging column 8 
## Compare row 6  and column  27 with corr  0.83 
##   Means:  0.513 vs 0.375 so flagging column 6 
## Compare row 23  and column  21 with corr  0.993 
##   Means:  0.51 vs 0.364 so flagging column 23 
## Compare row 27  and column  26 with corr  0.903 
##   Means:  0.478 vs 0.354 so flagging column 27 
## Compare row 21  and column  24 with corr  0.99 
##   Means:  0.469 vs 0.343 so flagging column 21 
## Compare row 24  and column  3 with corr  0.953 
##   Means:  0.438 vs 0.333 so flagging column 24 
## Compare row 3  and column  1 with corr  0.998 
##   Means:  0.409 vs 0.325 so flagging column 3 
## Compare row 1  and column  4 with corr  0.992 
##   Means:  0.367 vs 0.318 so flagging column 1 
## Compare row 4  and column  14 with corr  0.841 
##   Means:  0.336 vs 0.314 so flagging column 4 
## Compare row 14  and column  13 with corr  0.931 
##   Means:  0.33 vs 0.311 so flagging column 14 
## Compare row 13  and column  11 with corr  0.964 
##   Means:  0.339 vs 0.312 so flagging column 13 
## Compare row 16  and column  17 with corr  0.859 
##   Means:  0.406 vs 0.306 so flagging column 16 
## Compare row 22  and column  2 with corr  0.902 
##   Means:  0.248 vs 0.301 so flagging column 2 
## All correlations <= 0.8
# Print the column indexes that findCorrelation() flagged as highly correlated

print(df_highlyCorrelated)
##  [1]  7 28  8  6 23 27 21 24  3  1  4 14 13 16  2
# Histogram of perimeter_worst, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = perimeter_worst, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of concave_points_mean, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = concave_points_mean, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of concave_points_worst, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = concave_points_worst, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of texture_mean, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = texture_mean, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of area_mean, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = area_mean, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of radius_worst, filled by and faceted on diagnosis
ggplot(
  data = Final_train_data,
  mapping = aes(x = radius_worst, fill = diagnosis)
) +
  geom_histogram() +
  facet_wrap(~diagnosis)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Converting diagnosis column into factor

# Convert the diagnosis column to a factor in both the training and test sets
Final_train_data[["diagnosis"]] <- as.factor(Final_train_data[["diagnosis"]])
df_test[["diagnosis"]] <- as.factor(df_test[["diagnosis"]])

Part b - Create a Decision tree using information gain splits

# Fit a classification tree for diagnosis on all other columns, using
# information gain as the splitting criterion, then print its summary.
DT <- rpart(
  formula = diagnosis ~ .,
  data = Final_train_data,
  method = "class",
  parms = list(split = "information")
)
summary(DT)
## Call:
## rpart(formula = diagnosis ~ ., data = Final_train_data, method = "class", 
##     parms = list(split = "information"))
##   n= 399 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.77272727      0 1.0000000 1.0000000 0.07120036
## 2 0.04924242      1 0.2272727 0.3636364 0.04922818
## 3 0.01515152      3 0.1287879 0.2651515 0.04280794
## 4 0.01000000      4 0.1136364 0.2651515 0.04280794
## 
## Variable importance
##      perimeter_worst           area_worst         radius_worst 
##                   16                   14                   14 
##       perimeter_mean            area_mean          radius_mean 
##                   13                   12                   12 
## concave_points_worst       concavity_mean  concave_points_mean 
##                    4                    3                    2 
##     compactness_mean        texture_worst         texture_mean 
##                    2                    2                    1 
##         perimeter_se    compactness_worst      concavity_worst 
##                    1                    1                    1 
##     smoothness_worst 
##                    1 
## 
## Node number 1: 399 observations,    complexity param=0.7727273
##   predicted class=0  expected loss=0.3308271  P(node) =1
##     class counts:   267   132
##    probabilities: 0.669 0.331 
##   left son=2 (257 obs) right son=3 (142 obs)
##   Primary splits:
##       perimeter_worst      < 107.2    to the left,  improve=153.2803, (0 missing)
##       concave_points_mean  < 0.04892  to the left,  improve=151.1599, (0 missing)
##       radius_worst         < 16.805   to the left,  improve=146.7947, (0 missing)
##       concave_points_worst < 0.14555  to the left,  improve=145.5684, (0 missing)
##       area_worst           < 888.85   to the left,  improve=145.0168, (0 missing)
##   Surrogate splits:
##       radius_worst   < 16.205   to the left,  agree=0.972, adj=0.923, (0 split)
##       area_worst     < 784.15   to the left,  agree=0.972, adj=0.923, (0 split)
##       perimeter_mean < 92.42    to the left,  agree=0.940, adj=0.831, (0 split)
##       area_mean      < 632.8    to the left,  agree=0.935, adj=0.817, (0 split)
##       radius_mean    < 14.15    to the left,  agree=0.932, adj=0.810, (0 split)
## 
## Node number 2: 257 observations,    complexity param=0.01515152
##   predicted class=0  expected loss=0.03891051  P(node) =0.6441103
##     class counts:   247    10
##    probabilities: 0.961 0.039 
##   left son=4 (243 obs) right son=5 (14 obs)
##   Primary splits:
##       concave_points_worst < 0.1352   to the left,  improve=21.11549, (0 missing)
##       smoothness_worst     < 0.17725  to the left,  improve=17.27677, (0 missing)
##       concavity_mean       < 0.09752  to the left,  improve=16.77982, (0 missing)
##       concave_points_mean  < 0.05583  to the left,  improve=16.39897, (0 missing)
##       concavity_worst      < 0.3967   to the left,  improve=16.39897, (0 missing)
##   Surrogate splits:
##       compactness_mean  < 0.1338   to the left,  agree=0.969, adj=0.429, (0 split)
##       concavity_mean    < 0.11265  to the left,  agree=0.969, adj=0.429, (0 split)
##       smoothness_worst  < 0.17725  to the left,  agree=0.969, adj=0.429, (0 split)
##       compactness_worst < 0.3932   to the left,  agree=0.969, adj=0.429, (0 split)
##       concavity_worst   < 0.3967   to the left,  agree=0.969, adj=0.429, (0 split)
## 
## Node number 3: 142 observations,    complexity param=0.04924242
##   predicted class=1  expected loss=0.1408451  P(node) =0.3558897
##     class counts:    20   122
##    probabilities: 0.141 0.859 
##   left son=6 (49 obs) right son=7 (93 obs)
##   Primary splits:
##       concave_points_mean  < 0.063655 to the left,  improve=24.58926, (0 missing)
##       texture_mean         < 16.795   to the left,  improve=24.33156, (0 missing)
##       texture_worst        < 21.745   to the left,  improve=24.33156, (0 missing)
##       perimeter_worst      < 116.05   to the left,  improve=21.55216, (0 missing)
##       concave_points_worst < 0.14905  to the left,  improve=21.50773, (0 missing)
##   Surrogate splits:
##       concavity_mean       < 0.10715  to the left,  agree=0.894, adj=0.694, (0 split)
##       concave_points_worst < 0.1463   to the left,  agree=0.894, adj=0.694, (0 split)
##       perimeter_worst      < 118.25   to the left,  agree=0.831, adj=0.510, (0 split)
##       compactness_mean     < 0.1027   to the left,  agree=0.810, adj=0.449, (0 split)
##       perimeter_se         < 2.87     to the left,  agree=0.803, adj=0.429, (0 split)
## 
## Node number 4: 243 observations
##   predicted class=0  expected loss=0.008230453  P(node) =0.6090226
##     class counts:   241     2
##    probabilities: 0.992 0.008 
## 
## Node number 5: 14 observations
##   predicted class=1  expected loss=0.4285714  P(node) =0.03508772
##     class counts:     6     8
##    probabilities: 0.429 0.571 
## 
## Node number 6: 49 observations,    complexity param=0.04924242
##   predicted class=1  expected loss=0.4081633  P(node) =0.122807
##     class counts:    20    29
##    probabilities: 0.408 0.592 
##   left son=12 (13 obs) right son=13 (36 obs)
##   Primary splits:
##       texture_worst   < 20.045   to the left,  improve=15.399240, (0 missing)
##       texture_mean    < 15.745   to the left,  improve=13.816120, (0 missing)
##       area_worst      < 957.45   to the left,  improve= 7.383271, (0 missing)
##       perimeter_worst < 128.05   to the left,  improve= 6.846033, (0 missing)
##       symmetry_worst  < 0.31965  to the left,  improve= 6.846033, (0 missing)
##   Surrogate splits:
##       texture_mean     < 15.745   to the left,  agree=0.980, adj=0.923, (0 split)
##       radius_se        < 0.2474   to the left,  agree=0.796, adj=0.231, (0 split)
##       texure_se        < 0.47315  to the left,  agree=0.796, adj=0.231, (0 split)
##       area_se          < 22.47    to the left,  agree=0.796, adj=0.231, (0 split)
##       compactness_mean < 0.1437   to the right, agree=0.776, adj=0.154, (0 split)
## 
## Node number 7: 93 observations
##   predicted class=1  expected loss=0  P(node) =0.2330827
##     class counts:     0    93
##    probabilities: 0.000 1.000 
## 
## Node number 12: 13 observations
##   predicted class=0  expected loss=0  P(node) =0.03258145
##     class counts:    13     0
##    probabilities: 1.000 0.000 
## 
## Node number 13: 36 observations
##   predicted class=1  expected loss=0.1944444  P(node) =0.09022556
##     class counts:     7    29
##    probabilities: 0.194 0.806
# Draw the fitted tree with rpart.plot(), then plot its complexity
# parameter table with plotcp().
rpart.plot(x = DT, main = "Decision Tree for medical diagnoses")
plotcp(x = DT)

# Insights: there are 5 leaf nodes in this decision tree

Part c - Major predictors

–The major predictors suggested by the tree are perimeter_worst, concave_points_worst, concave_points_mean and texture_worst. These are the major predictors because they are the variables the tree splits on, i.e. they give the maximum information gain at each split

–Yes, the predictors from the decision tree are consistent with the predictors we got from the correlation matrix

Part d - Highest probability of cancer

– If perimeter_worst is greater than or equal to 107 and concave_points_mean is greater than or equal to 0.064, then the person is likely to have cancerous tissue, and the probability in this case is 100% – If perimeter_worst is greater than or equal to 107, concave_points_mean is less than 0.064 and texture_worst is greater than or equal to 20, then the probability of the person having cancerous tissue is 81%

Part e - Accuracy of the decision tree model

# Predict classes for the training data and cross-tabulate them against the
# actual diagnosis (rows = actual, columns = predicted).
predict_train <- predict(DT, newdata = Final_train_data, type = "class")
table_train <- table(Final_train_data[["diagnosis"]], predict_train)
table_train
##    predict_train
##       0   1
##   0 254  13
##   1   2 130
# Predict classes for the test data and cross-tabulate them against the
# actual diagnosis (rows = actual, columns = predicted).
predict_test <- predict(DT, newdata = df_test, type = "class")
table_test <- table(df_test[["diagnosis"]], predict_test)
table_test
##    predict_test
##      0  1
##   0 28  4
##   1  2 23
# Training accuracy: the diagonal of the confusion table (correct
# predictions) divided by the total number of cases.
accuracy_Train <- sum(diag(table_train)) / sum(table_train)
print(paste("Accuracy for train", accuracy_Train))
## [1] "Accuracy for train 0.962406015037594"
# Findings : Accuracy for the train data is 96.2%

# Test accuracy: the diagonal of the confusion table (correct predictions)
# divided by the total number of cases.
accuracy_Test <- sum(diag(table_test)) / sum(table_test)
print(paste("Accuracy for test", accuracy_Test))
## [1] "Accuracy for test 0.894736842105263"
# Findings : Accuracy for the test data is 89.5%

Part f - Constructing the best possible decision tree

# Build a second decision tree to try to improve the accuracy. The control
# settings allow nodes with at least 10 observations to be split and require
# at least 5 observations in each leaf.
DT1 <- rpart(
  formula = diagnosis ~ .,
  data = Final_train_data,
  method = "class",
  parms = list(split = "information"),
  control = rpart.control(minsplit = 10, minbucket = 5, cp = 0.01)
)

# Summary of the new decision tree
summary(DT1)
## Call:
## rpart(formula = diagnosis ~ ., data = Final_train_data, method = "class", 
##     parms = list(split = "information"), control = rpart.control(minsplit = 10, 
##         minbucket = 5, cp = 0.01))
##   n= 399 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.77272727      0 1.00000000 1.0000000 0.07120036
## 2 0.04924242      1 0.22727273 0.3787879 0.05009996
## 3 0.02651515      3 0.12878788 0.2272727 0.03990372
## 4 0.01893939      5 0.07575758 0.1893939 0.03667291
## 5 0.01000000      7 0.03787879 0.1893939 0.03667291
## 
## Variable importance
##      perimeter_worst         radius_worst           area_worst 
##                   16                   14                   13 
##       perimeter_mean            area_mean          radius_mean 
##                   12                   12                   12 
## concave_points_worst  concave_points_mean       concavity_mean 
##                    4                    3                    2 
##     compactness_mean        texture_worst         texture_mean 
##                    2                    2                    2 
##     smoothness_worst         perimeter_se    compactness_worst 
##                    1                    1                    1 
##      concavity_worst            texure_se 
##                    1                    1 
## 
## Node number 1: 399 observations,    complexity param=0.7727273
##   predicted class=0  expected loss=0.3308271  P(node) =1
##     class counts:   267   132
##    probabilities: 0.669 0.331 
##   left son=2 (257 obs) right son=3 (142 obs)
##   Primary splits:
##       perimeter_worst      < 107.2     to the left,  improve=153.2803, (0 missing)
##       concave_points_mean  < 0.04892   to the left,  improve=151.1599, (0 missing)
##       radius_worst         < 16.805    to the left,  improve=146.7947, (0 missing)
##       concave_points_worst < 0.14555   to the left,  improve=145.5684, (0 missing)
##       area_worst           < 888.85    to the left,  improve=145.0168, (0 missing)
##   Surrogate splits:
##       radius_worst   < 16.205    to the left,  agree=0.972, adj=0.923, (0 split)
##       area_worst     < 784.15    to the left,  agree=0.972, adj=0.923, (0 split)
##       perimeter_mean < 92.42     to the left,  agree=0.940, adj=0.831, (0 split)
##       area_mean      < 632.8     to the left,  agree=0.935, adj=0.817, (0 split)
##       radius_mean    < 14.15     to the left,  agree=0.932, adj=0.810, (0 split)
## 
## Node number 2: 257 observations,    complexity param=0.02651515
##   predicted class=0  expected loss=0.03891051  P(node) =0.6441103
##     class counts:   247    10
##    probabilities: 0.961 0.039 
##   left son=4 (243 obs) right son=5 (14 obs)
##   Primary splits:
##       concave_points_worst < 0.1352    to the left,  improve=21.11549, (0 missing)
##       smoothness_worst     < 0.17725   to the left,  improve=17.27677, (0 missing)
##       concavity_mean       < 0.09752   to the left,  improve=16.77982, (0 missing)
##       concave_points_mean  < 0.05583   to the left,  improve=16.39897, (0 missing)
##       concavity_worst      < 0.3967    to the left,  improve=16.39897, (0 missing)
##   Surrogate splits:
##       compactness_mean  < 0.1338    to the left,  agree=0.969, adj=0.429, (0 split)
##       concavity_mean    < 0.11265   to the left,  agree=0.969, adj=0.429, (0 split)
##       smoothness_worst  < 0.17725   to the left,  agree=0.969, adj=0.429, (0 split)
##       compactness_worst < 0.3932    to the left,  agree=0.969, adj=0.429, (0 split)
##       concavity_worst   < 0.3967    to the left,  agree=0.969, adj=0.429, (0 split)
## 
## Node number 3: 142 observations,    complexity param=0.04924242
##   predicted class=1  expected loss=0.1408451  P(node) =0.3558897
##     class counts:    20   122
##    probabilities: 0.141 0.859 
##   left son=6 (49 obs) right son=7 (93 obs)
##   Primary splits:
##       concave_points_mean  < 0.063655  to the left,  improve=24.58926, (0 missing)
##       texture_mean         < 16.795    to the left,  improve=24.33156, (0 missing)
##       texture_worst        < 21.745    to the left,  improve=24.33156, (0 missing)
##       perimeter_worst      < 116.05    to the left,  improve=21.55216, (0 missing)
##       concave_points_worst < 0.14905   to the left,  improve=21.50773, (0 missing)
##   Surrogate splits:
##       concavity_mean       < 0.10715   to the left,  agree=0.894, adj=0.694, (0 split)
##       concave_points_worst < 0.1463    to the left,  agree=0.894, adj=0.694, (0 split)
##       perimeter_worst      < 118.25    to the left,  agree=0.831, adj=0.510, (0 split)
##       compactness_mean     < 0.1027    to the left,  agree=0.810, adj=0.449, (0 split)
##       perimeter_se         < 2.87      to the left,  agree=0.803, adj=0.429, (0 split)
## 
## Node number 4: 243 observations
##   predicted class=0  expected loss=0.008230453  P(node) =0.6090226
##     class counts:   241     2
##    probabilities: 0.992 0.008 
## 
## Node number 5: 14 observations,    complexity param=0.02651515
##   predicted class=1  expected loss=0.4285714  P(node) =0.03508772
##     class counts:     6     8
##    probabilities: 0.429 0.571 
##   left son=10 (7 obs) right son=11 (7 obs)
##   Primary splits:
##       texture_worst           < 26.9      to the left,  improve=6.689899, (0 missing)
##       texture_mean            < 20.05     to the left,  improve=3.832086, (0 missing)
##       symmetry_mean           < 0.2044    to the left,  improve=3.832086, (0 missing)
##       smoothness_worst        < 0.17825   to the left,  improve=3.832086, (0 missing)
##       fractal_dimension_worst < 0.11785   to the left,  improve=3.832086, (0 missing)
##   Surrogate splits:
##       texture_mean        < 18.42     to the left,  agree=0.857, adj=0.714, (0 split)
##       symmetry_mean       < 0.20965   to the left,  agree=0.786, adj=0.571, (0 split)
##       texure_se           < 1.0685    to the left,  agree=0.786, adj=0.571, (0 split)
##       symmetry_worst      < 0.2679    to the left,  agree=0.786, adj=0.571, (0 split)
##       concave_points_mean < 0.04271   to the left,  agree=0.714, adj=0.429, (0 split)
## 
## Node number 6: 49 observations,    complexity param=0.04924242
##   predicted class=1  expected loss=0.4081633  P(node) =0.122807
##     class counts:    20    29
##    probabilities: 0.408 0.592 
##   left son=12 (13 obs) right son=13 (36 obs)
##   Primary splits:
##       texture_worst   < 20.045    to the left,  improve=15.399240, (0 missing)
##       texture_mean    < 15.745    to the left,  improve=13.816120, (0 missing)
##       area_worst      < 957.45    to the left,  improve= 7.383271, (0 missing)
##       perimeter_worst < 128.05    to the left,  improve= 6.846033, (0 missing)
##       symmetry_worst  < 0.31965   to the left,  improve= 6.846033, (0 missing)
##   Surrogate splits:
##       texture_mean     < 15.745    to the left,  agree=0.980, adj=0.923, (0 split)
##       radius_se        < 0.2474    to the left,  agree=0.796, adj=0.231, (0 split)
##       texure_se        < 0.47315   to the left,  agree=0.796, adj=0.231, (0 split)
##       area_se          < 22.47     to the left,  agree=0.796, adj=0.231, (0 split)
##       compactness_mean < 0.1437    to the right, agree=0.776, adj=0.154, (0 split)
## 
## Node number 7: 93 observations
##   predicted class=1  expected loss=0  P(node) =0.2330827
##     class counts:     0    93
##    probabilities: 0.000 1.000 
## 
## Node number 10: 7 observations
##   predicted class=0  expected loss=0.1428571  P(node) =0.01754386
##     class counts:     6     1
##    probabilities: 0.857 0.143 
## 
## Node number 11: 7 observations
##   predicted class=1  expected loss=0  P(node) =0.01754386
##     class counts:     0     7
##    probabilities: 0.000 1.000 
## 
## Node number 12: 13 observations
##   predicted class=0  expected loss=0  P(node) =0.03258145
##     class counts:    13     0
##    probabilities: 1.000 0.000 
## 
## Node number 13: 36 observations,    complexity param=0.01893939
##   predicted class=1  expected loss=0.1944444  P(node) =0.09022556
##     class counts:     7    29
##    probabilities: 0.194 0.806 
##   left son=26 (12 obs) right son=27 (24 obs)
##   Primary splits:
##       radius_worst        < 16.8      to the left,  improve=5.259041, (0 missing)
##       concave_points_se   < 0.0099805 to the right, improve=4.784799, (0 missing)
##       area_worst          < 871.8     to the left,  improve=4.647885, (0 missing)
##       smoothness_worst    < 0.13645   to the left,  improve=3.972922, (0 missing)
##       concave_points_mean < 0.048785  to the left,  improve=3.618231, (0 missing)
##   Surrogate splits:
##       area_worst      < 871.8     to the left,  agree=0.972, adj=0.917, (0 split)
##       radius_mean     < 15.045    to the left,  agree=0.889, adj=0.667, (0 split)
##       area_mean       < 697.8     to the left,  agree=0.889, adj=0.667, (0 split)
##       perimeter_worst < 111.7     to the left,  agree=0.861, adj=0.583, (0 split)
##       perimeter_mean  < 94.485    to the left,  agree=0.806, adj=0.417, (0 split)
## 
## Node number 26: 12 observations,    complexity param=0.01893939
##   predicted class=0  expected loss=0.5  P(node) =0.03007519
##     class counts:     6     6
##    probabilities: 0.500 0.500 
##   left son=52 (5 obs) right son=53 (7 obs)
##   Primary splits:
##       concave_points_mean < 0.048785  to the left,  improve=5.446952, (0 missing)
##       smoothness_worst    < 0.13755   to the left,  improve=5.446952, (0 missing)
##       smoothness_mean     < 0.097515  to the left,  improve=2.911032, (0 missing)
##       smoothness_se       < 0.0053495 to the left,  improve=2.911032, (0 missing)
##       radius_mean         < 14.335    to the right, improve=1.627867, (0 missing)
##   Surrogate splits:
##       smoothness_mean  < 0.09218   to the left,  agree=0.917, adj=0.8, (0 split)
##       compactness_se   < 0.03024   to the right, agree=0.833, adj=0.6, (0 split)
##       smoothness_worst < 0.13755   to the left,  agree=0.833, adj=0.6, (0 split)
##       texture_mean     < 21.915    to the right, agree=0.750, adj=0.4, (0 split)
##       perimeter_mean   < 95.145    to the right, agree=0.750, adj=0.4, (0 split)
## 
## Node number 27: 24 observations
##   predicted class=1  expected loss=0.04166667  P(node) =0.06015038
##     class counts:     1    23
##    probabilities: 0.042 0.958 
## 
## Node number 52: 5 observations
##   predicted class=0  expected loss=0  P(node) =0.01253133
##     class counts:     5     0
##    probabilities: 1.000 0.000 
## 
## Node number 53: 7 observations
##   predicted class=1  expected loss=0.1428571  P(node) =0.01754386
##     class counts:     1     6
##    probabilities: 0.143 0.857
# Cross-tabulate the classes predicted by DT1 against the actual diagnosis
# for the training data
DT1_train <- table(
  pred = predict(DT1, newdata = Final_train_data, type = "class"),
  true = Final_train_data$diagnosis
)

# Same cross-tabulation for the test data
DT1_test <- table(
  pred = predict(DT1, newdata = df_test, type = "class"),
  true = df_test$diagnosis
)


# Training accuracy: cases on the diagonal of the prediction table
# (predicted class equals true class) as a share of all cases
accuracy_Train_DT1 <- local({
  n_correct <- sum(diag(DT1_train))
  n_total <- sum(DT1_train)
  n_correct / n_total
})
print(accuracy_Train_DT1)
## [1] 0.9874687
# Insights : Accuracy of train data is 98.75%

# Test accuracy: cases on the diagonal of the prediction table
# (predicted class equals true class) as a share of all cases
accuracy_Test_DT1 <- local({
  n_correct <- sum(diag(DT1_test))
  n_total <- sum(DT1_test)
  n_correct / n_total
})
print(accuracy_Test_DT1)
## [1] 0.9473684
# Insights : Accuracy of test data is 94.74%

# Confusion matrix and summary statistics for the training-data
# prediction table built above
confusionMatrix(
  data = DT1_train,
  reference = Final_train_data$diagnosis
)
## Confusion Matrix and Statistics
## 
##     true
## pred   0   1
##    0 265   3
##    1   2 129
##                                          
##                Accuracy : 0.9875         
##                  95% CI : (0.971, 0.9959)
##     No Information Rate : 0.6692         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9716         
##                                          
##  Mcnemar's Test P-Value : 1              
##                                          
##             Sensitivity : 0.9925         
##             Specificity : 0.9773         
##          Pos Pred Value : 0.9888         
##          Neg Pred Value : 0.9847         
##              Prevalence : 0.6692         
##          Detection Rate : 0.6642         
##    Detection Prevalence : 0.6717         
##       Balanced Accuracy : 0.9849         
##                                          
##        'Positive' Class : 0              
## 
#Insights (train data, positive class = 0) : 1. Accuracy : 98.75%
#           2.Sensitivity : 0.9925
#           3.Specificity : 0.9773


# ROC curve for new model DT1
# Score every test row with the second column of the class-probability
# matrix, i.e. the predicted probability of class 1
scoreTst <- predict(DT1, newdata = df_test, type = "prob")[, 2]
print(scoreTst)
##           1           2           3           4           5           6 
## 0.008230453 0.008230453 0.008230453 1.000000000 1.000000000 1.000000000 
##           7           8           9          10          11          12 
## 0.008230453 0.958333333 0.008230453 0.008230453 0.000000000 1.000000000 
##          13          14          15          16          17          18 
## 0.008230453 0.008230453 1.000000000 0.000000000 0.008230453 0.008230453 
##          19          20          21          22          23          24 
## 0.958333333 0.142857143 1.000000000 0.008230453 0.008230453 0.008230453 
##          25          26          27          28          29          30 
## 0.008230453 0.008230453 0.000000000 0.008230453 1.000000000 1.000000000 
##          31          32          33          34          35          36 
## 0.008230453 0.008230453 0.008230453 1.000000000 0.958333333 1.000000000 
##          37          38          39          40          41          42 
## 0.008230453 0.008230453 0.008230453 0.008230453 1.000000000 1.000000000 
##          43          44          45          46          47          48 
## 1.000000000 1.000000000 0.008230453 0.008230453 1.000000000 1.000000000 
##          49          50          51          52          53          54 
## 1.000000000 1.000000000 0.008230453 0.008230453 0.008230453 1.000000000 
##          55          56          57 
## 0.958333333 1.000000000 0.008230453
# Build a ROCR prediction object from the DT1 test scores.
# label.ordering = c('0', '1') declares '0' as the negative and '1' as the
# positive class. ROCR:: is spelled out on every call so that a same-named
# function from another attached package cannot be picked up by accident.
rocPredTst <- ROCR::prediction(scoreTst, df_test$diagnosis, label.ordering = c('0', '1'))

# True-positive rate against false-positive rate, i.e. the ROC curve of DT1
perfROCTst <- ROCR::performance(rocPredTst, "tpr", "fpr")

# ROC curve for an initial model DT, built the same way
scoreTst_DT <- predict(DT, df_test, type = "prob")[, 2]
rocPredTst_DT <- ROCR::prediction(scoreTst_DT, df_test$diagnosis, label.ordering = c('0', '1'))
perfROCTst_DT <- ROCR::performance(rocPredTst_DT, "tpr", "fpr")

# Overlay both curves; the title, chance diagonal and legend make the two
# models distinguishable in the figure
plot(perfROCTst, col = "black", main = "ROC curves on test data: DT1 vs DT")
plot(perfROCTst_DT, add = TRUE, col = "blue")
abline(a = 0, b = 1, lty = 2, col = "grey")
legend(
  "bottomright",
  legend = c("DT1 (final model)", "DT (initial model)"),
  col = c("black", "blue"),
  lty = 1
)

# AUC value for Final model DT1: area under the ROC curve computed from
# the DT1 prediction object
aucPerf_final <- ROCR::performance(rocPredTst, measure = "auc")
slot(aucPerf_final, "y.values")
## [[1]]
## [1] 0.944375
#Findings : AUC value for DT1 is 0.944

# AUC value for initial model DT: area under the ROC curve computed from
# the DT prediction object
aucPerf_initial <- ROCR::performance(rocPredTst_DT, measure = "auc")
slot(aucPerf_initial, "y.values")
## [[1]]
## [1] 0.926875
# Findings : AUC value for DT is 0.9269

Part g - Plot your final decision tree model and write down all decision rules

# Draw the final tree
rpart.plot(DT1, main="Final Decision Tree for medical diagnoses")

# Print the fitted tree's decision rules as text, one row per leaf;
# cover = TRUE adds the percentage of observations that fall under each rule
rpart.rules(DT1, cover = TRUE)

#Decision rules : the split conditions themselves are read from the tree plotted above.
#Model settings behind these rules :
#1. Diagnosis is the factor (target) variable
#2. Splits are made on information gain
#3. A minimum split of 10 is used
#4. A minimum bucket of 5 is used
#5. The decision tree is built with complexity parameter (cp) 0.01

Problem 5 - Build Machine learning model to detect phishing attacks

Importing the Data

# Read the ARFF file as plain CSV: with "@" as the comment character,
# read.csv ignores everything from an "@" to the end of the line, and
# header = FALSE keeps the first data row from being used as column names
df_phi <- read.csv(
  file = "Training Dataset.arff",
  header = FALSE,
  comment.char = "@"
)

Renaming the column names

# Assign the 31 column names in file order. The spellings are kept exactly
# as given, because the rest of the analysis refers to the columns by
# these names.
names(df_phi) <- c(
  "having_IP_Address",
  "URL_Length",
  "Shortining_Service",
  "having_At_Symbol",
  "double_slash_redirecting",
  "Prefix_Suffix",
  "having_Sub_Domain",
  "SSLfinal_State",
  "Domain_registeration_length",
  "Favicon",
  "port",
  "HTTPS_token",
  "Request_URL",
  "URL_of_Anchor",
  "Links_in_tags",
  "SFH",
  "Submitting_to_email",
  "Abnormal_URL",
  "Redirect",
  "on_mouseover",
  "RightClick",
  "popUpWidnow",
  "Iframe",
  "age_of_domain",
  "DNSRecord",
  "web_traffic",
  "Page_Rank",
  "Google_Index",
  "Links_pointing_to_page",
  "Statistical_report",
  "Result"
)

# Preview the first six rows to check the renamed columns
head(x = df_phi, n = 6L)

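#If a column has only -1 and 1, then -1 = phishing and 1 = legitimate
#If a column has -1, 0 and 1, then -1 = legitimate, 0 = suspicious, 1 = phishing
#NOTE(review): the two encodings above point in opposite directions, yet the three-valued columns SSLfinal_State and URL_of_Anchor correlate positively (about 0.71 and 0.69) with Result in the correlation matrix printed further down - confirm the three-valued mapping against the dataset documentation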

Exploratory data analysis

# Structure of the data frame: dimensions, column types and first values
str(object = df_phi)
## 'data.frame':    11055 obs. of  31 variables:
##  $ having_IP_Address          : int  -1 1 1 1 1 -1 1 1 1 1 ...
##  $ URL_Length                 : int  1 1 0 0 0 0 0 0 0 1 ...
##  $ Shortining_Service         : int  1 1 1 1 -1 -1 -1 1 -1 -1 ...
##  $ having_At_Symbol           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ double_slash_redirecting   : int  -1 1 1 1 1 -1 1 1 1 1 ...
##  $ Prefix_Suffix              : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ having_Sub_Domain          : int  -1 0 -1 -1 1 1 -1 -1 1 -1 ...
##  $ SSLfinal_State             : int  -1 1 -1 -1 1 1 -1 -1 1 1 ...
##  $ Domain_registeration_length: int  -1 -1 -1 1 -1 -1 1 1 -1 -1 ...
##  $ Favicon                    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ port                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ HTTPS_token                : int  -1 -1 -1 -1 1 -1 1 -1 -1 1 ...
##  $ Request_URL                : int  1 1 1 -1 1 1 -1 -1 1 1 ...
##  $ URL_of_Anchor              : int  -1 0 0 0 0 0 -1 0 0 0 ...
##  $ Links_in_tags              : int  1 -1 -1 0 0 0 0 -1 1 1 ...
##  $ SFH                        : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ Submitting_to_email        : int  -1 1 -1 1 1 -1 -1 1 1 1 ...
##  $ Abnormal_URL               : int  -1 1 -1 1 1 -1 -1 1 1 1 ...
##  $ Redirect                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ on_mouseover               : int  1 1 1 1 -1 1 1 1 1 1 ...
##  $ RightClick                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ popUpWidnow                : int  1 1 1 1 -1 1 1 1 1 1 ...
##  $ Iframe                     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ age_of_domain              : int  -1 -1 1 -1 -1 1 1 -1 1 1 ...
##  $ DNSRecord                  : int  -1 -1 -1 -1 -1 1 -1 -1 -1 -1 ...
##  $ web_traffic                : int  -1 0 1 1 0 1 -1 0 1 0 ...
##  $ Page_Rank                  : int  -1 -1 -1 -1 -1 -1 -1 -1 1 -1 ...
##  $ Google_Index               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Links_pointing_to_page     : int  1 1 0 -1 1 -1 0 0 0 0 ...
##  $ Statistical_report         : int  -1 1 -1 1 1 -1 -1 1 1 1 ...
##  $ Result                     : int  -1 -1 -1 -1 1 1 -1 -1 1 -1 ...
#Findings : There are 31 fields with 11055 rows in train dataset

# Summary statistics (min, quartiles, mean, max) for every column
summary(object = df_phi)
##  having_IP_Address   URL_Length      Shortining_Service having_At_Symbol 
##  Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.0000    Min.   :-1.0000  
##  1st Qu.:-1.0000   1st Qu.:-1.0000   1st Qu.: 1.0000    1st Qu.: 1.0000  
##  Median : 1.0000   Median :-1.0000   Median : 1.0000    Median : 1.0000  
##  Mean   : 0.3138   Mean   :-0.6332   Mean   : 0.7388    Mean   : 0.7006  
##  3rd Qu.: 1.0000   3rd Qu.:-1.0000   3rd Qu.: 1.0000    3rd Qu.: 1.0000  
##  Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.0000    Max.   : 1.0000  
##  double_slash_redirecting Prefix_Suffix    having_Sub_Domain  SSLfinal_State   
##  Min.   :-1.0000          Min.   :-1.000   Min.   :-1.00000   Min.   :-1.0000  
##  1st Qu.: 1.0000          1st Qu.:-1.000   1st Qu.:-1.00000   1st Qu.:-1.0000  
##  Median : 1.0000          Median :-1.000   Median : 0.00000   Median : 1.0000  
##  Mean   : 0.7415          Mean   :-0.735   Mean   : 0.06395   Mean   : 0.2509  
##  3rd Qu.: 1.0000          3rd Qu.:-1.000   3rd Qu.: 1.00000   3rd Qu.: 1.0000  
##  Max.   : 1.0000          Max.   : 1.000   Max.   : 1.00000   Max.   : 1.0000  
##  Domain_registeration_length    Favicon             port        
##  Min.   :-1.0000             Min.   :-1.0000   Min.   :-1.0000  
##  1st Qu.:-1.0000             1st Qu.: 1.0000   1st Qu.: 1.0000  
##  Median :-1.0000             Median : 1.0000   Median : 1.0000  
##  Mean   :-0.3368             Mean   : 0.6286   Mean   : 0.7283  
##  3rd Qu.: 1.0000             3rd Qu.: 1.0000   3rd Qu.: 1.0000  
##  Max.   : 1.0000             Max.   : 1.0000   Max.   : 1.0000  
##   HTTPS_token       Request_URL      URL_of_Anchor      Links_in_tags    
##  Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.00000   Min.   :-1.0000  
##  1st Qu.: 1.0000   1st Qu.:-1.0000   1st Qu.:-1.00000   1st Qu.:-1.0000  
##  Median : 1.0000   Median : 1.0000   Median : 0.00000   Median : 0.0000  
##  Mean   : 0.6751   Mean   : 0.1868   Mean   :-0.07653   Mean   :-0.1181  
##  3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 0.00000   3rd Qu.: 0.0000  
##  Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.00000   Max.   : 1.0000  
##       SFH          Submitting_to_email  Abnormal_URL        Redirect     
##  Min.   :-1.0000   Min.   :-1.0000     Min.   :-1.0000   Min.   :0.0000  
##  1st Qu.:-1.0000   1st Qu.: 1.0000     1st Qu.: 1.0000   1st Qu.:0.0000  
##  Median :-1.0000   Median : 1.0000     Median : 1.0000   Median :0.0000  
##  Mean   :-0.5957   Mean   : 0.6356     Mean   : 0.7053   Mean   :0.1157  
##  3rd Qu.:-1.0000   3rd Qu.: 1.0000     3rd Qu.: 1.0000   3rd Qu.:0.0000  
##  Max.   : 1.0000   Max.   : 1.0000     Max.   : 1.0000   Max.   :1.0000  
##   on_mouseover       RightClick       popUpWidnow          Iframe       
##  Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.0000  
##  1st Qu.: 1.0000   1st Qu.: 1.0000   1st Qu.: 1.0000   1st Qu.: 1.0000  
##  Median : 1.0000   Median : 1.0000   Median : 1.0000   Median : 1.0000  
##  Mean   : 0.7621   Mean   : 0.9139   Mean   : 0.6134   Mean   : 0.8169  
##  3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 1.0000  
##  Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.0000  
##  age_of_domain        DNSRecord        web_traffic        Page_Rank      
##  Min.   :-1.00000   Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.0000  
##  1st Qu.:-1.00000   1st Qu.:-1.0000   1st Qu.: 0.0000   1st Qu.:-1.0000  
##  Median : 1.00000   Median : 1.0000   Median : 1.0000   Median :-1.0000  
##  Mean   : 0.06124   Mean   : 0.3771   Mean   : 0.2873   Mean   :-0.4837  
##  3rd Qu.: 1.00000   3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 1.0000  
##  Max.   : 1.00000   Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.0000  
##   Google_Index     Links_pointing_to_page Statistical_report     Result       
##  Min.   :-1.0000   Min.   :-1.000         Min.   :-1.0000    Min.   :-1.0000  
##  1st Qu.: 1.0000   1st Qu.: 0.000         1st Qu.: 1.0000    1st Qu.:-1.0000  
##  Median : 1.0000   Median : 0.000         Median : 1.0000    Median : 1.0000  
##  Mean   : 0.7216   Mean   : 0.344         Mean   : 0.7196    Mean   : 0.1139  
##  3rd Qu.: 1.0000   3rd Qu.: 1.000         3rd Qu.: 1.0000    3rd Qu.: 1.0000  
##  Max.   : 1.0000   Max.   : 1.000         Max.   : 1.0000    Max.   : 1.0000
# Count the missing values in each column: flag every NA cell, then sum
# the flags column by column
df_phi |>
  is.na() |>
  colSums()
##           having_IP_Address                  URL_Length 
##                           0                           0 
##          Shortining_Service            having_At_Symbol 
##                           0                           0 
##    double_slash_redirecting               Prefix_Suffix 
##                           0                           0 
##           having_Sub_Domain              SSLfinal_State 
##                           0                           0 
## Domain_registeration_length                     Favicon 
##                           0                           0 
##                        port                 HTTPS_token 
##                           0                           0 
##                 Request_URL               URL_of_Anchor 
##                           0                           0 
##               Links_in_tags                         SFH 
##                           0                           0 
##         Submitting_to_email                Abnormal_URL 
##                           0                           0 
##                    Redirect                on_mouseover 
##                           0                           0 
##                  RightClick                 popUpWidnow 
##                           0                           0 
##                      Iframe               age_of_domain 
##                           0                           0 
##                   DNSRecord                 web_traffic 
##                           0                           0 
##                   Page_Rank                Google_Index 
##                           0                           0 
##      Links_pointing_to_page          Statistical_report 
##                           0                           0 
##                      Result 
##                           0
# There are no missing values in the dataset

# Outlier check: draw one box plot per column of the data frame
boxplot(
  x = df_phi,
  main = "Outlier detection of all columns using box plot"
)

# Insights : Every value is -1, 0 or 1, so there are no outliers; therefore no outlier treatment is performed

# Pairwise correlations between all columns; "pearson" is cor()'s default
# method, written out here for clarity
correlationMatrix <- cor(x = df_phi, method = "pearson")

# Display the full correlation matrix
print(correlationMatrix)
##                             having_IP_Address    URL_Length Shortining_Service
## having_IP_Address                 1.000000000 -0.0524107388       0.4034610930
## URL_Length                       -0.052410739  1.0000000000      -0.0978809097
## Shortining_Service                0.403461093 -0.0978809097       1.0000000000
## having_At_Symbol                  0.158698951 -0.0751084756       0.1044465494
## double_slash_redirecting          0.397389087 -0.0812470792       0.8427956224
## Prefix_Suffix                    -0.005256975  0.0552467160      -0.0804705533
## having_Sub_Domain                -0.080744639  0.0039968791      -0.0419161818
## SSLfinal_State                    0.071414500  0.0487537328      -0.0614256521
## Domain_registeration_length      -0.022739206 -0.2218924309       0.0609231944
## Favicon                           0.087024829 -0.0424972200       0.0061006489
## port                              0.060979206  0.0003229483       0.0022008313
## HTTPS_token                       0.363534482 -0.0893825485       0.7578377012
## Request_URL                       0.029772867  0.2463480961      -0.0372345268
## URL_of_Anchor                     0.099846955 -0.0233955096       0.0005614793
## Links_in_tags                     0.006212404  0.0528690722      -0.1333791164
## SFH                              -0.010962287  0.4141962193      -0.0227234768
## Submitting_to_email               0.077989153 -0.0144574973       0.0493282274
## Abnormal_URL                      0.336549357 -0.1067608602       0.7392895721
## Redirect                         -0.321181419  0.0468322384      -0.5345296596
## on_mouseover                      0.084059316 -0.0451030375       0.0623834682
## RightClick                        0.042881431 -0.0136133645       0.0381184510
## popUpWidnow                       0.096882295 -0.0493812351       0.0366157897
## Iframe                            0.054694437 -0.0138382241       0.0165807574
## age_of_domain                    -0.010445721  0.1794264244      -0.0525958372
## DNSRecord                        -0.050733303 -0.0408233717       0.4360642903
## web_traffic                       0.002922205  0.0089927816      -0.0470743710
## Page_Rank                        -0.091773751  0.1835179731       0.0145913709
## Google_Index                      0.029152889  0.0029024154       0.1558437456
## Links_pointing_to_page           -0.339065107 -0.0229874156      -0.1984097700
## Statistical_report               -0.019102515 -0.0671530890       0.0854607583
## Result                            0.094160095  0.0574296293      -0.0679658927
##                             having_At_Symbol double_slash_redirecting
## having_IP_Address                0.158698951              0.397389087
## URL_Length                      -0.075108476             -0.081247079
## Shortining_Service               0.104446549              0.842795622
## having_At_Symbol                 1.000000000              0.086959950
## double_slash_redirecting         0.086959950              1.000000000
## Prefix_Suffix                   -0.011725539             -0.085590400
## having_Sub_Domain               -0.058975763             -0.043078808
## SSLfinal_State                   0.031220000             -0.036199897
## Domain_registeration_length      0.015521535              0.047464231
## Favicon                          0.304899055              0.035100076
## port                             0.364890515              0.025060376
## HTTPS_token                      0.104561007              0.760799398
## Request_URL                      0.027909185             -0.026367650
## URL_of_Anchor                    0.057913889             -0.005035903
## Links_in_tags                   -0.070861354             -0.125582670
## SFH                             -0.008671648             -0.041672182
## Submitting_to_email              0.370122659              0.031897965
## Abnormal_URL                     0.203944877              0.723723563
## Redirect                        -0.028159591             -0.591477929
## on_mouseover                     0.279697003              0.086634704
## RightClick                       0.219503021              0.025863086
## popUpWidnow                      0.290892843              0.054462617
## Iframe                           0.284409945              0.010458641
## age_of_domain                   -0.005499097             -0.050106635
## DNSRecord                       -0.047871518              0.431409480
## web_traffic                      0.032918391             -0.062369383
## Page_Rank                       -0.064735176             -0.003132332
## Google_Index                     0.037061311              0.178414903
## Links_pointing_to_page          -0.006080372             -0.194164634
## Statistical_report              -0.080356601              0.070389942
## Result                           0.052947789             -0.038607612
##                             Prefix_Suffix having_Sub_Domain SSLfinal_State
## having_IP_Address            -0.005256975      -0.080744639    0.071414500
## URL_Length                    0.055246716       0.003996879    0.048753733
## Shortining_Service           -0.080470553      -0.041916182   -0.061425652
## having_At_Symbol             -0.011725539      -0.058975763    0.031220000
## double_slash_redirecting     -0.085590400      -0.043078808   -0.036199897
## Prefix_Suffix                 1.000000000       0.087891090    0.261390532
## having_Sub_Domain             0.087891090       1.000000000    0.267648755
## SSLfinal_State                0.261390532       0.267648755    1.000000000
## Domain_registeration_length  -0.096798530      -0.082838574   -0.193622046
## Favicon                      -0.007504228      -0.016704372   -0.014756696
## port                         -0.022545832       0.004862571    0.027472848
## HTTPS_token                  -0.070153106      -0.037239209   -0.029941373
## Request_URL                   0.098674835       0.104856615    0.193054440
## URL_of_Anchor                 0.348871199       0.229490709    0.535786191
## Links_in_tags                 0.100253994       0.093645954    0.176825163
## SFH                           0.001325541       0.096088828    0.171402389
## Submitting_to_email          -0.045000319       0.008829840    0.008061500
## Abnormal_URL                 -0.077620303      -0.034907592   -0.046245129
## Redirect                      0.016271426       0.031205629   -0.021069747
## on_mouseover                  0.012578297      -0.018082180    0.023585818
## RightClick                   -0.024868266       0.018229873    0.015854372
## popUpWidnow                  -0.014732588      -0.025312228   -0.013004887
## Iframe                       -0.036904442       0.010636524   -0.002773194
## age_of_domain                 0.074116234       0.119253766    0.162809420
## DNSRecord                    -0.016555559       0.125493384    0.050971780
## web_traffic                   0.110597625      -0.005763910    0.258767835
## Page_Rank                    -0.006833928       0.120730067    0.074545009
## Google_Index                  0.067780621       0.057672540    0.096051392
## Links_pointing_to_page        0.067423361      -0.010525803   -0.011710227
## Statistical_report           -0.002762565       0.081627379    0.063410931
## Result                        0.348605570       0.298323324    0.714741195
##                             Domain_registeration_length       Favicon
## having_IP_Address                          -0.022739206  0.0870248294
## URL_Length                                 -0.221892431 -0.0424972200
## Shortining_Service                          0.060923194  0.0061006489
## having_At_Symbol                            0.015521535  0.3048990547
## double_slash_redirecting                    0.047464231  0.0351000758
## Prefix_Suffix                              -0.096798530 -0.0075042284
## having_Sub_Domain                          -0.082838574 -0.0167043723
## SSLfinal_State                             -0.193622046 -0.0147566962
## Domain_registeration_length                 1.000000000  0.0542534505
## Favicon                                     0.054253451  1.0000000000
## port                                        0.022477551  0.8038335605
## HTTPS_token                                 0.059161071  0.0494830293
## Request_URL                                -0.609969688 -0.0046204010
## URL_of_Anchor                              -0.160257307  0.0376978302
## Links_in_tags                              -0.101084122 -0.1003410487
## SFH                                        -0.136421980 -0.0122792023
## Submitting_to_email                         0.039260343  0.6683166076
## Abnormal_URL                                0.058108720  0.0718477294
## Redirect                                   -0.016299937 -0.0156209326
## on_mouseover                                0.023783847  0.7061793318
## RightClick                                  0.023520437  0.4143822065
## popUpWidnow                                 0.051409859  0.9396329250
## Iframe                                      0.004393378  0.6276072100
## age_of_domain                              -0.062851285 -0.0026278428
## DNSRecord                                  -0.010476736  0.0882106587
## web_traffic                                -0.134454334 -0.0509218451
## Page_Rank                                  -0.059898164  0.0116992227
## Google_Index                               -0.039765764 -0.0166677106
## Links_pointing_to_page                      0.122671569 -0.1272430958
## Statistical_report                         -0.002212040  0.3009172188
## Result                                     -0.225789462 -0.0002795247
##                                      port  HTTPS_token  Request_URL
## having_IP_Address            0.0609792061  0.363534482  0.029772867
## URL_Length                   0.0003229483 -0.089382549  0.246348096
## Shortining_Service           0.0022008313  0.757837701 -0.037234527
## having_At_Symbol             0.3648905154  0.104561007  0.027909185
## double_slash_redirecting     0.0250603764  0.760799398 -0.026367650
## Prefix_Suffix               -0.0225458322 -0.070153106  0.098674835
## having_Sub_Domain            0.0048625706 -0.037239209  0.104856615
## SSLfinal_State               0.0274728481 -0.029941373  0.193054440
## Domain_registeration_length  0.0224775512  0.059161071 -0.609969688
## Favicon                      0.8038335605  0.049483029 -0.004620401
## port                         1.0000000000  0.004998623  0.027561329
## HTTPS_token                  0.0049986228  1.000000000 -0.006619689
## Request_URL                  0.0275613290 -0.006619689  1.000000000
## URL_of_Anchor                0.0398913780  0.011850847  0.177693201
## Links_in_tags               -0.0665020356 -0.104381017  0.067491008
## SFH                          0.0066721989 -0.009679787  0.126660605
## Submitting_to_email          0.7990881593  0.075477795  0.018177694
## Abnormal_URL                 0.0541264070  0.716287367 -0.036033554
## Redirect                    -0.0224719854 -0.460164543  0.002329259
## on_mouseover                 0.6232982087  0.110113271  0.008144032
## RightClick                   0.4816308909  0.009264613 -0.020451804
## popUpWidnow                  0.7485171731  0.066956881 -0.004622095
## Iframe                       0.6870441024  0.017508903  0.016933672
## age_of_domain                0.0084587860 -0.049632114  0.090455473
## DNSRecord                    0.0548490936  0.395386634  0.015932566
## web_traffic                 -0.0285426924 -0.039708370  0.161166301
## Page_Rank                    0.0179537185  0.021104115  0.055734008
## Google_Index                -0.0054130396  0.115449951  0.046408594
## Links_pointing_to_page      -0.1391039546 -0.128724152 -0.067109020
## Statistical_report           0.3439868065  0.096186734  0.035411775
## Result                       0.0364188509 -0.039853895  0.253372272
##                             URL_of_Anchor Links_in_tags          SFH
## having_IP_Address            0.0998469553   0.006212404 -0.010962287
## URL_Length                  -0.0233955096   0.052869072  0.414196219
## Shortining_Service           0.0005614793  -0.133379116 -0.022723477
## having_At_Symbol             0.0579138894  -0.070861354 -0.008671648
## double_slash_redirecting    -0.0050359032  -0.125582670 -0.041672182
## Prefix_Suffix                0.3488711994   0.100253994  0.001325541
## having_Sub_Domain            0.2294907094   0.093645954  0.096088828
## SSLfinal_State               0.5357861908   0.176825163  0.171402389
## Domain_registeration_length -0.1602573071  -0.101084122 -0.136421980
## Favicon                      0.0376978302  -0.100341049 -0.012279202
## port                         0.0398913780  -0.066502036  0.006672199
## HTTPS_token                  0.0118508466  -0.104381017 -0.009679787
## Request_URL                  0.1776932006   0.067491008  0.126660605
## URL_of_Anchor                1.0000000000   0.136283304  0.114311328
## Links_in_tags                0.1362833037   1.000000000  0.066597600
## SFH                          0.1143113279   0.066597600  1.000000000
## Submitting_to_email          0.0333858654  -0.043231306  0.011472617
## Abnormal_URL                -0.0105852295  -0.116065466 -0.030751841
## Redirect                    -0.0008394492   0.041497199  0.049907173
## on_mouseover                 0.0677416541  -0.077669580  0.007579218
## RightClick                   0.0221681580  -0.037469195  0.008467338
## popUpWidnow                  0.0411501923  -0.112281660 -0.004862726
## Iframe                       0.0134027350  -0.070029712  0.007066971
## age_of_domain                0.0755081304   0.078056784 -0.015839916
## DNSRecord                    0.0932883535  -0.038544709  0.034439508
## web_traffic                  0.3262932300   0.064548051  0.052706363
## Page_Rank                    0.0992608715  -0.006449921  0.001978560
## Google_Index                 0.0388162297   0.045557482  0.027587678
## Links_pointing_to_page       0.0186506323   0.013561478 -0.009067706
## Statistical_report           0.0773766300  -0.087343254 -0.005288871
## Result                       0.6929345206   0.248228515  0.221419008
##                             Submitting_to_email Abnormal_URL      Redirect
## having_IP_Address                    0.07798915  0.336549357 -0.3211814194
## URL_Length                          -0.01445750 -0.106760860  0.0468322384
## Shortining_Service                   0.04932823  0.739289572 -0.5345296596
## having_At_Symbol                     0.37012266  0.203944877 -0.0281595909
## double_slash_redirecting             0.03189797  0.723723563 -0.5914779293
## Prefix_Suffix                       -0.04500032 -0.077620303  0.0162714260
## having_Sub_Domain                    0.00882984 -0.034907592  0.0312056293
## SSLfinal_State                       0.00806150 -0.046245129 -0.0210697467
## Domain_registeration_length          0.03926034  0.058108720 -0.0162999368
## Favicon                              0.66831661  0.071847729 -0.0156209326
## port                                 0.79908816  0.054126407 -0.0224719854
## HTTPS_token                          0.07547780  0.716287367 -0.4601645427
## Request_URL                          0.01817769 -0.036033554  0.0023292586
## URL_of_Anchor                        0.03338587 -0.010585229 -0.0008394492
## Links_in_tags                       -0.04323131 -0.116065466  0.0414971992
## SFH                                  0.01147262 -0.030751841  0.0499071729
## Submitting_to_email                  1.00000000  0.195850077 -0.0073206100
## Abnormal_URL                         0.19585008  1.000000000 -0.4591870655
## Redirect                            -0.00732061 -0.459187066  1.0000000000
## on_mouseover                         0.53165607  0.117637628 -0.0348225195
## RightClick                           0.39862940  0.023710312 -0.0235864240
## popUpWidnow                          0.62946227  0.091188160 -0.0263268337
## Iframe                               0.57749027  0.017590440 -0.0126676610
## age_of_domain                        0.00735732 -0.032532658 -0.0224755413
## DNSRecord                            0.06414537  0.366833384 -0.2110959875
## web_traffic                         -0.01568455 -0.052415797  0.0046314208
## Page_Rank                            0.02620836  0.007317677  0.0528666040
## Google_Index                        -0.00837762  0.124751201  0.0572302894
## Links_pointing_to_page              -0.03995573 -0.161026568  0.1612777884
## Statistical_report                   0.35207398  0.186399046 -0.0591942408
## Result                               0.01824901 -0.060487642 -0.0201134617
##                             on_mouseover   RightClick   popUpWidnow
## having_IP_Address            0.084059316  0.042881431  9.688229e-02
## URL_Length                  -0.045103038 -0.013613365 -4.938124e-02
## Shortining_Service           0.062383468  0.038118451  3.661579e-02
## having_At_Symbol             0.279697003  0.219503021  2.908928e-01
## double_slash_redirecting     0.086634704  0.025863086  5.446262e-02
## Prefix_Suffix                0.012578297 -0.024868266 -1.473259e-02
## having_Sub_Domain           -0.018082180  0.018229873 -2.531223e-02
## SSLfinal_State               0.023585818  0.015854372 -1.300489e-02
## Domain_registeration_length  0.023783847  0.023520437  5.140986e-02
## Favicon                      0.706179332  0.414382207  9.396329e-01
## port                         0.623298209  0.481630891  7.485172e-01
## HTTPS_token                  0.110113271  0.009264613  6.695688e-02
## Request_URL                  0.008144032 -0.020451804 -4.622095e-03
## URL_of_Anchor                0.067741654  0.022168158  4.115019e-02
## Links_in_tags               -0.077669580 -0.037469195 -1.122817e-01
## SFH                          0.007579218  0.008467338 -4.862726e-03
## Submitting_to_email          0.531656071  0.398629404  6.294623e-01
## Abnormal_URL                 0.117637628  0.023710312  9.118816e-02
## Redirect                    -0.034822519 -0.023586424 -2.632683e-02
## on_mouseover                 1.000000000  0.474054044  7.336288e-01
## RightClick                   0.474054044  1.000000000  4.152681e-01
## popUpWidnow                  0.733628798  0.415268089  1.000000e+00
## Iframe                       0.659478008  0.655862534  6.294063e-01
## age_of_domain                0.013305706  0.006763936 -9.482202e-04
## DNSRecord                    0.087161413  0.038254940  9.865790e-02
## web_traffic                 -0.036531254 -0.013594318 -4.319003e-02
## Page_Rank                    0.015633688  0.025341161  1.711415e-02
## Google_Index                -0.006510003 -0.008065574 -1.025554e-02
## Links_pointing_to_page      -0.038551028 -0.119831033 -1.213255e-01
## Statistical_report           0.277346725  0.204409183  2.852606e-01
## Result                       0.041838440  0.012653235  8.588679e-05
##                                   Iframe age_of_domain   DNSRecord  web_traffic
## having_IP_Address            0.054694437 -0.0104457207 -0.05073330  0.002922205
## URL_Length                  -0.013838224  0.1794264244 -0.04082337  0.008992782
## Shortining_Service           0.016580757 -0.0525958372  0.43606429 -0.047074371
## having_At_Symbol             0.284409945 -0.0054990967 -0.04787152  0.032918391
## double_slash_redirecting     0.010458641 -0.0501066349  0.43140948 -0.062369383
## Prefix_Suffix               -0.036904442  0.0741162339 -0.01655556  0.110597625
## having_Sub_Domain            0.010636524  0.1192537659  0.12549338 -0.005763910
## SSLfinal_State              -0.002773194  0.1628094196  0.05097178  0.258767835
## Domain_registeration_length  0.004393378 -0.0628512848 -0.01047674 -0.134454334
## Favicon                      0.627607210 -0.0026278428  0.08821066 -0.050921845
## port                         0.687044102  0.0084587860  0.05484909 -0.028542692
## HTTPS_token                  0.017508903 -0.0496321141  0.39538663 -0.039708370
## Request_URL                  0.016933672  0.0904554734  0.01593257  0.161166301
## URL_of_Anchor                0.013402735  0.0755081304  0.09328835  0.326293230
## Links_in_tags               -0.070029712  0.0780567844 -0.03854471  0.064548051
## SFH                          0.007066971 -0.0158399160  0.03443951  0.052706363
## Submitting_to_email          0.577490269  0.0073573196  0.06414537 -0.015684545
## Abnormal_URL                 0.017590440 -0.0325326580  0.36683338 -0.052415797
## Redirect                    -0.012667661 -0.0224755413 -0.21109599  0.004631421
## on_mouseover                 0.659478008  0.0133057058  0.08716141 -0.036531254
## RightClick                   0.655862534  0.0067639361  0.03825494 -0.013594318
## popUpWidnow                  0.629406251 -0.0009482202  0.09865790 -0.043190028
## Iframe                       1.000000000  0.0188476779  0.04729313 -0.022079555
## age_of_domain                0.018847678  1.0000000000 -0.03408235  0.089948950
## DNSRecord                    0.047293133 -0.0340823497  1.00000000  0.048649503
## web_traffic                 -0.022079555  0.0899489497  0.04864950  1.000000000
## Page_Rank                    0.022407265 -0.1471935455  0.13786020  0.030984366
## Google_Index                -0.003519028 -0.0284714597  0.13750903 -0.012583857
## Links_pointing_to_page      -0.140823687  0.0404074371 -0.31826599 -0.019859996
## Statistical_report           0.268417804  0.0091150444  0.13685990  0.009222875
## Result                      -0.003393524  0.1214964165  0.07571775  0.346103108
##                                Page_Rank Google_Index Links_pointing_to_page
## having_IP_Address           -0.091773751  0.029152889           -0.339065107
## URL_Length                   0.183517973  0.002902415           -0.022987416
## Shortining_Service           0.014591371  0.155843746           -0.198409770
## having_At_Symbol            -0.064735176  0.037061311           -0.006080372
## double_slash_redirecting    -0.003132332  0.178414903           -0.194164634
## Prefix_Suffix               -0.006833928  0.067780621            0.067423361
## having_Sub_Domain            0.120730067  0.057672540           -0.010525803
## SSLfinal_State               0.074545009  0.096051392           -0.011710227
## Domain_registeration_length -0.059898164 -0.039765764            0.122671569
## Favicon                      0.011699223 -0.016667711           -0.127243096
## port                         0.017953719 -0.005413040           -0.139103955
## HTTPS_token                  0.021104115  0.115449951           -0.128724152
## Request_URL                  0.055734008  0.046408594           -0.067109020
## URL_of_Anchor                0.099260872  0.038816230            0.018650632
## Links_in_tags               -0.006449921  0.045557482            0.013561478
## SFH                          0.001978560  0.027587678           -0.009067706
## Submitting_to_email          0.026208360 -0.008377620           -0.039955727
## Abnormal_URL                 0.007317677  0.124751201           -0.161026568
## Redirect                     0.052866604  0.057230289            0.161277788
## on_mouseover                 0.015633688 -0.006510003           -0.038551028
## RightClick                   0.025341161 -0.008065574           -0.119831033
## popUpWidnow                  0.017114154 -0.010255539           -0.121325462
## Iframe                       0.022407265 -0.003519028           -0.140823687
## age_of_domain               -0.147193545 -0.028471460            0.040407437
## DNSRecord                    0.137860200  0.137509033           -0.318265990
## web_traffic                  0.030984366 -0.012583857           -0.019859996
## Page_Rank                    1.000000000  0.032431230           -0.028215509
## Google_Index                 0.032431230  1.000000000           -0.038776791
## Links_pointing_to_page      -0.028215509 -0.038776791            1.000000000
## Statistical_report           0.031048613 -0.005102843           -0.016817478
## Result                       0.104644905  0.128950452            0.032573899
##                             Statistical_report        Result
## having_IP_Address                 -0.019102515  9.416009e-02
## URL_Length                        -0.067153089  5.742963e-02
## Shortining_Service                 0.085460758 -6.796589e-02
## having_At_Symbol                  -0.080356601  5.294779e-02
## double_slash_redirecting           0.070389942 -3.860761e-02
## Prefix_Suffix                     -0.002762565  3.486056e-01
## having_Sub_Domain                  0.081627379  2.983233e-01
## SSLfinal_State                     0.063410931  7.147412e-01
## Domain_registeration_length       -0.002212040 -2.257895e-01
## Favicon                            0.300917219 -2.795247e-04
## port                               0.343986806  3.641885e-02
## HTTPS_token                        0.096186734 -3.985390e-02
## Request_URL                        0.035411775  2.533723e-01
## URL_of_Anchor                      0.077376630  6.929345e-01
## Links_in_tags                     -0.087343254  2.482285e-01
## SFH                               -0.005288871  2.214190e-01
## Submitting_to_email                0.352073976  1.824901e-02
## Abnormal_URL                       0.186399046 -6.048764e-02
## Redirect                          -0.059194241 -2.011346e-02
## on_mouseover                       0.277346725  4.183844e-02
## RightClick                         0.204409183  1.265323e-02
## popUpWidnow                        0.285260615  8.588679e-05
## Iframe                             0.268417804 -3.393524e-03
## age_of_domain                      0.009115044  1.214964e-01
## DNSRecord                          0.136859898  7.571775e-02
## web_traffic                        0.009222875  3.461031e-01
## Page_Rank                          0.031048613  1.046449e-01
## Google_Index                      -0.005102843  1.289505e-01
## Links_pointing_to_page            -0.016817478  3.257390e-02
## Statistical_report                 1.000000000  7.985672e-02
## Result                             0.079856718  1.000000e+00
# Plot the correlation matrix: upper triangle only, with the variables
# ordered by hierarchical clustering.

corrplot(
  correlationMatrix,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu"),
  tl.cex = 0.5
)

# Find attributes that are highly correlated with another attribute
# (absolute pairwise correlation above the 0.8 cutoff used below).

highlyCorrelated <- findCorrelation(
  x = correlationMatrix,
  cutoff = 0.8,
  verbose = TRUE
)
## Compare row 22  and column  10 with corr  0.94 
##   Means:  0.188 vs 0.118 so flagging column 22 
## Compare row 10  and column  11 with corr  0.804 
##   Means:  0.161 vs 0.114 so flagging column 10 
## Compare row 5  and column  3 with corr  0.843 
##   Means:  0.182 vs 0.109 so flagging column 5 
## All correlations <= 0.8
# Print the column indices of the highly correlated attributes

print(x = highlyCorrelated)
## [1] 22 10  5
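# findCorrelation() flags columns 22 (popUpWidnow), 10 (Favicon) and 5 (double_slash_redirecting) because each has a pairwise correlation above 0.8 with another predictor; it does not measure correlation with Result.
# From the Result column of the correlation matrix we can see that SSLfinal_State (0.71) and URL_of_Anchor (0.69) are the features most strongly correlated with the Result variable, while web_traffic (0.35), having_Sub_Domain (0.30), Request_URL (0.25) and Domain_registeration_length (-0.23) are only weakly to moderately correlated with it.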

# Plotting the relationships between important features and target variable.
# Result is mapped to fill through factor(): geom_bar() only splits (and
# dodges) bars by discrete variables, so a numeric Result would not give one
# bar per class. factor() is harmless if Result is already a factor.
# labs(fill = "Result") keeps the legend title as "Result".

# Plot Result vs SSLfinal_State
ggplot(df_phi, aes(x = SSLfinal_State, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

# Plot Result vs URL_of_Anchor
ggplot(df_phi, aes(x = URL_of_Anchor, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

# Plot Result vs web_traffic
ggplot(df_phi, aes(x = web_traffic, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

# Plot Result vs having_Sub_Domain
ggplot(df_phi, aes(x = having_Sub_Domain, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

# Plot Result vs Domain_registeration_length
ggplot(df_phi, aes(x = Domain_registeration_length, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

# Plot Result vs Request_URL
ggplot(df_phi, aes(x = Request_URL, fill = factor(Result))) +
  geom_bar(position = "dodge") +
  labs(fill = "Result")

Converting the Result field to factor variable

# Store the target column Result as a factor
df_phi[["Result"]] <- as.factor(df_phi[["Result"]])

Developing and evaluating the Decision tree model using cross validation techniques

# Split the dataset into train and test sets by drawing an independent
# TRUE/FALSE flag for every row (TRUE with probability 0.8), so the 80/20
# proportions are approximate rather than exact.
set.seed(1234)
# The mask is called is_train instead of `sample` so that it does not shadow
# base::sample(); the random draw itself is unchanged.
is_train <- sample(
  c(TRUE, FALSE),
  nrow(df_phi),
  replace = TRUE,
  prob = c(0.8, 0.2)
)
phi_train <- df_phi[is_train, ]
phi_test <- df_phi[!is_train, ]
head(phi_train)
head(phi_test)
# Fit a classification tree on the training data, splitting on the
# "information" index, then print the detailed summary of the fit.
DT_phi <- rpart(
  formula = Result ~ .,
  data = phi_train,
  method = "class",
  parms = list(split = "information")
)
summary(DT_phi)
## Call:
## rpart(formula = Result ~ ., data = phi_train, method = "class", 
##     parms = list(split = "information"))
##   n= 8856 
## 
##           CP nsplit rel error    xerror        xstd
## 1 0.74540347      0 1.0000000 1.0000000 0.011935025
## 2 0.03728294      1 0.2545965 0.2545965 0.007595734
## 3 0.01000000      2 0.2173136 0.2173136 0.007082457
## 
## Variable importance
##              SSLfinal_State               URL_of_Anchor 
##                          44                          31 
##                 web_traffic           having_Sub_Domain 
##                           9                           7 
## Domain_registeration_length                 Request_URL 
##                           5                           4 
## 
## Node number 1: 8856 observations,    complexity param=0.7454035
##   predicted class=1   expected loss=0.4421861  P(node) =1
##     class counts:  3916  4940
##    probabilities: 0.442 0.558 
##   left son=2 (3795 obs) right son=3 (5061 obs)
##   Primary splits:
##       SSLfinal_State    < 0.5  to the left,  improve=2963.2510, (0 missing)
##       URL_of_Anchor     < -0.5 to the left,  improve=2688.9690, (0 missing)
##       Prefix_Suffix     < 0    to the left,  improve= 752.3226, (0 missing)
##       web_traffic       < 0.5  to the left,  improve= 704.0302, (0 missing)
##       having_Sub_Domain < 0.5  to the left,  improve= 670.4705, (0 missing)
##   Surrogate splits:
##       URL_of_Anchor               < -0.5 to the left,  agree=0.821, adj=0.582, (0 split)
##       web_traffic                 < 0.5  to the left,  agree=0.657, adj=0.200, (0 split)
##       having_Sub_Domain           < 0.5  to the left,  agree=0.643, adj=0.166, (0 split)
##       Domain_registeration_length < 0    to the right, agree=0.619, adj=0.112, (0 split)
##       Request_URL                 < 0    to the left,  agree=0.614, adj=0.100, (0 split)
## 
## Node number 2: 3795 observations
##   predicted class=-1  expected loss=0.115415  P(node) =0.428523
##     class counts:  3357   438
##    probabilities: 0.885 0.115 
## 
## Node number 3: 5061 observations,    complexity param=0.03728294
##   predicted class=1   expected loss=0.1104525  P(node) =0.571477
##     class counts:   559  4502
##    probabilities: 0.110 0.890 
##   left son=6 (208 obs) right son=7 (4853 obs)
##   Primary splits:
##       URL_of_Anchor     < -0.5 to the left,  improve=333.3481, (0 missing)
##       web_traffic       < 0.5  to the left,  improve=165.9506, (0 missing)
##       Prefix_Suffix     < 0    to the left,  improve=143.7806, (0 missing)
##       Links_in_tags     < -0.5 to the left,  improve=117.9490, (0 missing)
##       having_Sub_Domain < 0.5  to the left,  improve=114.6232, (0 missing)
## 
## Node number 6: 208 observations
##   predicted class=-1  expected loss=0.1490385  P(node) =0.0234869
##     class counts:   177    31
##    probabilities: 0.851 0.149 
## 
## Node number 7: 4853 observations
##   predicted class=1   expected loss=0.0787142  P(node) =0.5479901
##     class counts:   382  4471
##    probabilities: 0.079 0.921
# Plotting decision tree using rpart.plot()
rpart.plot(DT_phi, main = "Decision Tree for Website phishing")

# Feature evaluation of decision tree: collect the tree's variable-importance
# scores into a data frame with one row per variable.
phi_feature <- data.frame(imp = DT_phi$variable.importance)

# Sort ascending by importance and freeze that order in the factor levels so
# the bar order in the chart follows the importance ranking.
phi_feature1 <- phi_feature %>%
  tibble::rownames_to_column(var = "variable") %>%
  dplyr::arrange(imp) %>%
  dplyr::mutate(variable = forcats::fct_inorder(variable))

# Horizontal bar chart of importance per variable. show.legend uses FALSE
# (not the reassignable F); the former scale_fill_grey() was dropped because
# no fill aesthetic is mapped, so it had no effect.
ggplot2::ggplot(phi_feature1) +
  geom_col(
    aes(x = variable, y = imp),
    col = "black",
    show.legend = FALSE
  ) +
  coord_flip() +
  theme_bw()

# The important features predicted from decision tree are SSLfinal_State, URL_of_Anchor, web_traffic, having_Sub_Domain, Domain_registeration_length,Request_URL


######################
# Class predictions of the decision tree on the training data, cross-tabulated
# against the observed classes (rows = observed, columns = predicted).

phi_predict_train <- predict(
  object = DT_phi,
  newdata = phi_train,
  type = "class"
)
phi_table_train <- table(phi_train$Result, phi_predict_train)
phi_table_train
##     phi_predict_train
##        -1    1
##   -1 3534  382
##   1   469 4471
# Class predictions of the decision tree on the test data, cross-tabulated
# against the observed classes (rows = observed, columns = predicted).

phi_predict_test <- predict(
  object = DT_phi,
  newdata = phi_test,
  type = "class"
)
phi_table_test <- table(phi_test$Result, phi_predict_test)
phi_table_test
##     phi_predict_test
##        -1    1
##   -1  891   91
##   1    94 1123
##################
# Confusion matrix to calculate the performance of the decision tree

# Confusion Matrix for train data.
# The predicted and observed factors are passed directly. The earlier call
# passed phi_table_train, which has observed classes in rows, but
# confusionMatrix() reads a table as rows = predictions, columns = reference
# and does not use `reference` for table input. Accuracy and Kappa are the
# same either way; Sensitivity/Specificity were swapped with Pos/Neg Pred
# Value. Output rendered from the earlier call must be re-knit to reflect this.
confusionMatrix(data = phi_predict_train, reference = phi_train$Result)
## Confusion Matrix and Statistics
## 
##     phi_predict_train
##        -1    1
##   -1 3534  382
##   1   469 4471
##                                         
##                Accuracy : 0.9039        
##                  95% CI : (0.8976, 0.91)
##     No Information Rate : 0.548         
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8057        
##                                         
##  Mcnemar's Test P-Value : 0.003198      
##                                         
##             Sensitivity : 0.8828        
##             Specificity : 0.9213        
##          Pos Pred Value : 0.9025        
##          Neg Pred Value : 0.9051        
##              Prevalence : 0.4520        
##          Detection Rate : 0.3991        
##    Detection Prevalence : 0.4422        
##       Balanced Accuracy : 0.9021        
##                                         
##        'Positive' Class : -1            
## 
#Findings :
# 1. Accuracy of the decision tree on training data is 90.3%
# 2. Sensitivity of the decision tree on training data is 0.88
# 3. Specificity of the decision tree on training data is 0.92

# Confusion Matrix for test data.
# The predicted and observed factors are passed directly. The earlier call
# passed phi_table_test, which has observed classes in rows, but
# confusionMatrix() reads a table as rows = predictions, columns = reference
# and does not use `reference` for table input. Accuracy and Kappa are the
# same either way; Sensitivity/Specificity were swapped with Pos/Neg Pred
# Value. Output rendered from the earlier call must be re-knit to reflect this.
confusionMatrix(data = phi_predict_test, reference = phi_test$Result)
## Confusion Matrix and Statistics
## 
##     phi_predict_test
##        -1    1
##   -1  891   91
##   1    94 1123
##                                           
##                Accuracy : 0.9159          
##                  95% CI : (0.9035, 0.9271)
##     No Information Rate : 0.5521          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8298          
##                                           
##  Mcnemar's Test P-Value : 0.8831          
##                                           
##             Sensitivity : 0.9046          
##             Specificity : 0.9250          
##          Pos Pred Value : 0.9073          
##          Neg Pred Value : 0.9228          
##              Prevalence : 0.4479          
##          Detection Rate : 0.4052          
##    Detection Prevalence : 0.4466          
##       Balanced Accuracy : 0.9148          
##                                           
##        'Positive' Class : -1              
## 
#Findings :
# 1. Accuracy of the decision tree on test data is 91.59%
# 2. Sensitivity of the decision tree on test data is 0.905
# 3. Specificity of the decision tree on test data is 0.925

###################

# Hold-out evaluation of the decision tree on the test data.
# Predicted and observed classes are converted to their numeric factor codes
# once, then compared with R2, RMSE and MAE.

Validation_DT_model <- local({
  predicted_code <- as.numeric(phi_predict_test)
  observed_code <- as.numeric(phi_test$Result)
  data.frame(
    R2 = R2(predicted_code, observed_code),
    RMSE = RMSE(predicted_code, observed_code),
    MAE = MAE(predicted_code, observed_code)
  )
})
Validation_DT_model
# Findings :
# 1. R2 (squared correlation between predicted and observed class codes) is 0.688
# 2. Mean Absolute Error (MAE) is 0.084
# 3. Root Mean Squared Error (RMSE) is 0.29

Random forest model and evaluation

# Fix the random seed before growing the forest
set.seed(51)

# Train a random forest on the training data (1000 trees, nodesize 10)
# and print the fitted model.

rf_model <- randomForest(
  formula = Result ~ .,
  data = phi_train,
  ntree = 1000,
  nodesize = 10
)
rf_model
## 
## Call:
##  randomForest(formula = Result ~ ., data = phi_train, ntree = 1000,      nodesize = 10) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 5
## 
##         OOB estimate of  error rate: 3.9%
## Confusion matrix:
##      -1    1 class.error
## -1 3702  214  0.05464760
## 1   131 4809  0.02651822
# Predict the classes of the test set with the random forest and
# cross-tabulate them against the observed classes
# (rows = observed, columns = predicted).
rf_pred <- predict(object = rf_model, newdata = phi_test)
rf_table_test <- table(phi_test$Result, rf_pred)

  
# Confusion Matrix for the random forest on test data.
# The predicted and observed factors are passed directly. The earlier call
# passed rf_table_test, which has observed classes in rows, but
# confusionMatrix() reads a table as rows = predictions, columns = reference
# and does not use `reference` for table input. Accuracy and Kappa are the
# same either way; Sensitivity/Specificity were swapped with Pos/Neg Pred
# Value. Output rendered from the earlier call must be re-knit to reflect this.
confusionMatrix(data = rf_pred, reference = phi_test$Result)
## Confusion Matrix and Statistics
## 
##     rf_pred
##        -1    1
##   -1  939   43
##   1    32 1185
##                                           
##                Accuracy : 0.9659          
##                  95% CI : (0.9574, 0.9731)
##     No Information Rate : 0.5584          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9309          
##                                           
##  Mcnemar's Test P-Value : 0.2482          
##                                           
##             Sensitivity : 0.9670          
##             Specificity : 0.9650          
##          Pos Pred Value : 0.9562          
##          Neg Pred Value : 0.9737          
##              Prevalence : 0.4416          
##          Detection Rate : 0.4270          
##    Detection Prevalence : 0.4466          
##       Balanced Accuracy : 0.9660          
##                                           
##        'Positive' Class : -1              
## 
#Findings :
# 1. Accuracy of the random forest model on test data is 96.59%
# 2. Sensitivity of the random forest model on test data is 0.967
# 3. Specificity of the random forest model on test data is 0.965

# Default plot of the fitted random forest
plot(x = rf_model)

# Variable importance scores, printed as a table
importance(x = rf_model)
##                             MeanDecreaseGini
## having_IP_Address                  34.949481
## URL_Length                         25.041544
## Shortining_Service                 16.476935
## having_At_Symbol                   11.745501
## double_slash_redirecting           12.622672
## Prefix_Suffix                     175.333063
## having_Sub_Domain                 222.216222
## SSLfinal_State                   1357.679219
## Domain_registeration_length        52.152567
## Favicon                            13.330742
## port                                8.198116
## HTTPS_token                        17.598783
## Request_URL                        63.905924
## URL_of_Anchor                    1025.334523
## Links_in_tags                     141.077161
## SFH                                69.485551
## Submitting_to_email                14.958272
## Abnormal_URL                       13.813237
## Redirect                           15.960494
## on_mouseover                       10.335438
## RightClick                          3.527376
## popUpWidnow                        15.436528
## Iframe                              7.147825
## age_of_domain                      34.651000
## DNSRecord                          37.043953
## web_traffic                       269.737583
## Page_Rank                          29.464816
## Google_Index                       38.460288
## Links_pointing_to_page             50.621932
## Statistical_report                 12.634788
# Plot the variable importance of the random forest
varImpPlot(x = rf_model)

# Hold-out evaluation of the random forest on the test data.
# Predicted and observed classes are converted to their numeric factor codes
# once, then compared with R2, RMSE and MAE.

Validation_rf_model <- local({
  predicted_code <- as.numeric(rf_pred)
  observed_code <- as.numeric(phi_test$Result)
  data.frame(
    R2 = R2(predicted_code, observed_code),
    RMSE = RMSE(predicted_code, observed_code),
    MAE = MAE(predicted_code, observed_code)
  )
})
Validation_rf_model
# Findings :
# 1. R2 (squared correlation between predicted and observed class codes) is 0.868 (high)
# 2. Mean Absolute Error (MAE) is 0.0336 (low)
# 3. Root Mean Squared Error (RMSE) is 0.183 (low)

We select the random forest model over the decision tree for the following reasons: 1. Random forest has a higher R2 (0.868) than the decision tree (0.688); a better model has a higher R-squared, so random forest is the better model. 2. The Mean Absolute Error (MAE) of random forest (0.0336) is lower than that of the decision tree (0.084); a better model has a lower MAE, so we choose the random forest model. 3. The Root Mean Squared Error (RMSE) of random forest (0.183) is lower than that of the decision tree (0.29), so it is the better model. 4. The test accuracy of the random forest model (96.6%) is about 5 percentage points higher than that of the decision tree model (91.6%), so we conclude that random forest is the better model.